Hi @ZacharyProser, thank you for responding. Here is our code. I'm having trouble with the retrieval step, and the overall goal is to be able to manage a large dataset.
import streamlit as st
import pandas as pd
import openai
import requests
import certifi
import tiktoken
from tiktoken import get_encoding
from pinecone import Pinecone, ServerlessSpec
# Configuration
OPENAI_API_KEY = ""
PINECONE_API_KEY = ""
INDEX_NAME = ""
BATCH_SIZE = 10

# Initialize OpenAI
openai.api_key = OPENAI_API_KEY

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Check if the index already exists before creating it
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
index = pc.Index(INDEX_NAME)
# Define helper functions
def truncate_text(text, max_tokens):
    # Use the same tokenizer as text-embedding-ada-002 (cl100k_base) so the
    # truncation limit matches what the embeddings API actually counts
    tokenizer = get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    return tokenizer.decode(tokens[:max_tokens])
def generate_embedding(text):
    headers = {
        'Authorization': f'Bearer {OPENAI_API_KEY}',
        'Content-Type': 'application/json',
    }
    data = {
        'model': 'text-embedding-ada-002',
        'input': text
    }
    response = requests.post(
        'https://api.openai.com/v1/embeddings',
        headers=headers,
        json=data,
        verify=certifi.where()
    )
    if response.status_code == 200:
        result = response.json()
        tokens_used = result['usage']['total_tokens']
        return result['data'][0]['embedding'], tokens_used
    else:
        st.error(f"Error creating embedding: {response.status_code} - {response.text}")
        return None, 0
def upsert_in_batches(index, vectors, batch_size=10):
    batches = [vectors[i:i + batch_size] for i in range(0, len(vectors), batch_size)]
    for batch in batches:
        try:
            index.upsert(vectors=batch, namespace="ns1")
        except Exception as e:
            st.error(f"Error upserting batch: {e}")
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
# Streamlit UI for file upload
st.title("Excel File Upload and Processing")
uploaded_file = st.file_uploader("Upload an Excel file", type=["xlsx"])
if uploaded_file is not None:
    st.write("File uploaded successfully!")

    # Load Excel file
    df = pd.read_excel(uploaded_file)
    st.write("Excel file loaded:")
    st.write(df.head())

    # Concatenate text from all columns for each row into a readable sentence
    def create_meaningful_sentence(row):
        return '. '.join([f"{col.replace('_', ' ')} {row[col]}" for col in df.columns])

    df['combined_text'] = df.apply(create_meaningful_sentence, axis=1)
    st.write("Columns concatenated into meaningful sentences:")
    st.write(df[['combined_text']].head())

    vectors = []

    # Process each row in the DataFrame
    total_tokens_used = 0
    total_requests = 0
    for i, row in df.iterrows():
        text = row['combined_text']
        # Truncate text to fit within the model's maximum context length
        text = truncate_text(text, max_tokens=8191)  # text-embedding-ada-002 accepts at most 8,191 input tokens
        embedding, tokens_used = generate_embedding(text)
        if embedding is not None:
            total_tokens_used += tokens_used
            total_requests += 1
            # Prepare metadata, handling NaN values and converting everything to
            # strings so Pinecone accepts the metadata values
            metadata = {
                "ID": str(row['ID']),  # Ensure ID is converted to string if needed
                "eventDtgTime": str(row['eventDtgTime']),
                "alerts": str(row.get('alerts', '')),
                "displayTitle": str(row['displayTitle']),
                "instantAnalytics": str(row.get('instantAnalytics', '')),
                "detailedText": str(row.get('detailedText', '')),
                "msgPrecs": str(row.get('msgPrecs', '')),
                "unit": str(row.get('unit', '')),
                "size": str(row.get('size', '')),
                "embedHtml": str(row.get('embedHtml', '')),
                "dataSources": str(row.get('dataSources', '')),
                "snippetText": str(row.get('snippetText', '')),
                "contentLink": str(row.get('contentLink', '')),
                "description": str(row.get('description', '')),
                "imageDescription": str(row.get('imageDescription', '')),
                "reportSummary": str(row.get('reportSummary', '')),
                "authorName": str(row.get('authorName', '')),
                "timeReportCompleted": str(row.get('timeReportCompleted', '')),
                "attachment": str(row.get('attachment', '')),
                "latitude": str(row.get('latitude', '')),
                "securityLevels": str(row.get('securityLevels', '')),
                "imagereSourceLink": str(row.get('imagereSourceLink', '')),
                "eventDtg": str(row.get('eventDtg', '')),
                "status": str(row.get('status', '')),
                "users": str(row.get('users', '')),
                "name": str(row.get('name', '')),
                "sessions": str(row.get('sessions', '')),
                "fiscalStatus": str(row.get('fiscalStatus', '')),
                "sentimentSummary": str(row.get('sentimentSummary', '')),
                "sourceOrg": str(row.get('sourceOrg', '')),
                "dateCreated": str(row.get('dateCreated', '')),
                "active": str(row.get('active', '')),
                "responseSummary": str(row.get('responseSummary', '')),
                "comparisonCommunitiesCountries": str(row.get('comparisonCommunitiesCountries', '')),
                "activity": str(row.get('activity', '')),
                "applications": str(row.get('applications', '')),
                "url": str(row.get('url', '')),
                "timeZones": str(row.get('timeZones', '')),
                "location": str(row.get('location', '')),
                "longitude": str(row.get('longitude', '')),
                "dateModified": str(row.get('dateModified', '')),
                "pedigrees": str(row.get('pedigrees', '')),
                "gistComment": str(row.get('gistComment', '')),
                "tag": str(row.get('tag', '')),
                "geoCode": str(row.get('geoCode', '')),
                "time": str(row.get('time', '')),
                "timeReportRouted": str(row.get('timeReportRouted', '')),
                "rteToOrg": str(row.get('rteToOrg', '')),
                "copyReportToOrg": str(row.get('copyReportToOrg', '')),
                "sourceOrganization": str(row.get('sourceOrganization', '')),
                "coordinates": str(row.get('coordinates', '')),
                "image1": str(row.get('image1', '')),
                "image2": str(row.get('image2', '')),
                "image3": str(row.get('image3', '')),
                "image4": str(row.get('image4', '')),
                "image5": str(row.get('image5', '')),
                "numEmailsSent": str(row.get('numEmailsSent', '')),
                "lastEmailDate": str(row.get('lastEmailDate', '')),
                "reportDtg": str(row.get('reportDtg', '')),
                "metadata": str(row.get('metadata', '')),
                "eventOrganizations": str(row.get('eventOrganizations', '')),
                "classification": str(row.get('classification', '')),
                "assetIPs": str(row.get('assetIPs', '')),
                "sitrepTemplate": str(row.get('sitrepTemplate', '')),
                "industry": str(row.get('industry', '')),
                "networkSegmentList": str(row.get('networkSegmentList', '')),
                "approvedDate": str(row.get('approvedDate', '')),
                "incident": str(row.get('incident', '')),
                "sendEmail": str(row.get('sendEmail', '')),
                "newFormat": str(row['newFormat']),
                "duMapping": str(row.get('duMapping', '')),
                "jsonTag": str(row.get('jsonTag', '')),
                "createdFrom": str(row.get('createdFrom', '')),
                "integrationData": str(row.get('integrationData', '')),
                "mtti": str(row.get('mtti', '')),
                "mttd": str(row.get('mttd', '')),
                "mttr": str(row.get('mttr', '')),
                "oldEventDate": str(row.get('oldEventDate', '')),
                "org_event_name": str(row['org_event_name']),
                "combined_text": text  # Add combined text to metadata
            }
            vectors.append({'id': str(row['ID']), 'values': embedding, 'metadata': metadata})
    if vectors:
        upsert_in_batches(index, vectors, BATCH_SIZE)
        st.success("Data successfully uploaded to Pinecone.")
        st.info(f"Total tokens used: {total_tokens_used}")
        st.info(f"Total requests made: {total_requests}")
    else:
        st.warning("No embeddings were generated.")
# Query input and response
st.title("Query the Uploaded Data")
query = st.text_input("Enter your query:")

if query:
    query_embedding, _ = generate_embedding(query)
    if query_embedding is not None:
        results = index.query(
            namespace="ns1",
            vector=query_embedding,
            top_k=3,
            include_metadata=True
        )
        relevant_info = []
        for match in results['matches']:
            metadata = match['metadata']
            entry = {
                "ID": match['id'],
                "Score": match['score'],
                "Event Date/Time": metadata.get('eventDtgTime', 'N/A'),
                "Display Title": metadata.get('displayTitle', 'N/A'),
                "Detailed Text": metadata.get('detailedText', 'N/A'),
                "Unit": metadata.get('unit', 'N/A'),
                "Organization Event Name": metadata.get('org_event_name', 'N/A'),
                "Combined Text": metadata.get('combined_text', 'N/A')
                # Add more fields as needed
            }
            relevant_info.append(entry)

        relevant_info.sort(key=lambda x: x['Score'], reverse=True)
        # Prepare context for GPT, limiting total tokens
        context = ""
        system_prompt = "You are a helpful assistant with expertise in analyzing SITREP data."
        user_prompt_template = f"Based on the following information from our database, please answer the query: '{query}'\n\nRelevant Information:\n{{context}}\n\nPlease provide a clear and concise answer, citing specific details from the relevant information when applicable. If the information to answer the query is not available in the provided context, please state that clearly."
        max_tokens = 16000  # Leave some room for the response
        current_tokens = num_tokens_from_string(system_prompt) + num_tokens_from_string(user_prompt_template.format(context=""))

        for entry in relevant_info:
            entry_text = f"ID: {entry['ID']}\n"
            entry_text += f"Event Date/Time: {entry['Event Date/Time']}\n"
            entry_text += f"Display Title: {entry['Display Title']}\n"
            entry_text += f"Detailed Text: {entry['Detailed Text']}\n"
            entry_text += f"Unit: {entry['Unit']}\n"
            entry_text += f"Organization Event Name: {entry['Organization Event Name']}\n"
            entry_text += f"Combined Text: {entry['Combined Text']}\n"
            entry_text += "---\n"
            if current_tokens + num_tokens_from_string(entry_text) > max_tokens:
                break
            context += entry_text
            current_tokens += num_tokens_from_string(entry_text)

        # Prepare the prompt for GPT
        user_prompt = user_prompt_template.format(context=context)
        # Generate response using GPT
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=500,
            temperature=0.7
        )

        # Display only the answer
        answer = response.choices[0].message.content
        st.write("Answer to your query:")
        st.write(answer)
    else:
        st.error("Failed to generate query embedding.")