I am trying to upsert Freshdesk ticket data into Pinecone and then query that data. Despite trying multiple approaches, I’m still struggling with what seems to be a simple task. I’d appreciate any guidance on what might be going wrong.
Upserting Data
I have the following code to upsert Freshdesk ticket data into Pinecone:
import os
import json
import pinecone
from sentence_transformers import SentenceTransformer
import numpy as np
import logging
import time
from requests.exceptions import SSLError
logging.basicConfig(level=logging.INFO)
def initialize_pinecone(api_key, environment, index_name, dimension):
pinecone.init(api_key=api_key, environment=environment)
if index_name not in pinecone.list_indexes():
pinecone.create_index(index_name, dimension=dimension)
def load_model(model_name, device='cuda'):
model = SentenceTransformer(model_name)
return model.to(device)
def upsert_batches(index, vectors, index_name, batch_size=100, max_retries=5, sleep_time=0.5):
for i in range(0, len(vectors), batch_size):
batch = vectors[i:i + batch_size]
retries = 0
while retries <= max_retries:
try:
index.upsert(batch, namespace=index_name)
logging.info(f"Upserted batch {i // batch_size + 1} of {len(vectors) // batch_size}")
break # Success, exit the retry loop
except SSLError as e:
retries += 1
logging.warning(f"SSL error on batch {i // batch_size + 1}. Retrying {retries}/{max_retries}. Error: {e}")
time.sleep(sleep_time * (2 ** retries)) # Exponential backoff
else:
logging.error(f"Max retries exceeded for batch {i // batch_size + 1}. Skipping.")
time.sleep(sleep_time) # Sleep between batches
def process_tickets(directory, text_model, meta_model, index_name):
ticket_files = os.listdir(directory)
vectors = []
for file in ticket_files:
file_path = os.path.join(directory, file)
try:
with open(file_path) as f:
data = f.read().replace('\u0000', '')
tickets = json.loads(data)
text_fields = [t['helpdesk_ticket']['description'] for t in tickets]
meta_fields = [t['helpdesk_ticket']['subject'] for t in tickets]
# Encode into embeddings
text_embeds = np.asarray(text_model.encode(text_fields)).tolist()
meta_embeds = np.asarray(meta_model.encode(meta_fields)).tolist()
# Prepare vectors
for text_embed, meta_embed, ticket in zip(text_embeds, meta_embeds, tickets):
vectors.extend([
{'id': str(ticket['helpdesk_ticket']['id']) + '_text', 'values': text_embed},
{'id': str(ticket['helpdesk_ticket']['id']) + '_meta', 'values': meta_embed}
])
except Exception as e:
line, column = e.lineno, e.colno
logging.error(f"JSON parse error at line {line} column {column}")
index = pinecone.Index(index_name)
upsert_batches(index, vectors, index_name) # Replace the original upsert call with this
API_KEY = "KEY"
ENVIRONMENT = "ENV"
INDEX_NAME = 'index'
DIMENSION = 768
initialize_pinecone(API_KEY, ENVIRONMENT, INDEX_NAME, DIMENSION)
text_model = load_model('multi-qa-mpnet-base-cos-v1')
meta_model = load_model('multi-qa-mpnet-base-cos-v1')
process_tickets('drive/MyDrive/ticketsall', text_model, meta_model, INDEX_NAME)
Querying Data
Here is the code to query the ticket data:
import os
import pinecone
import logging
from sentence_transformers import SentenceTransformer
from langchain import PromptTemplate, LLMChain
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
logging.basicConfig(level=logging.INFO)
def initialize_models():
query_model = SentenceTransformer('multi-qa-mpnet-base-cos-v1')
return query_model
def format_ticket_context(fetched_tickets):
# Assuming fetched_tickets contains the required ticket information
return '\n'.join([f"Ticket ID: {t['id']}\nDescription: {t['description']}\nSubject: {t['subject']}" for t in fetched_tickets])
def query_tickets(question, index_name):
question_embed = query_model.encode([question])
results = index.query(queries=question_embed.tolist(), top_k=5, namespace=index_name)
logging.info("Query results:", results)
if results.get('matches'):
ticket_ids = [r['id'].split('_')[0] for r in results['matches']]
fetched = index.fetch(ids=ticket_ids, namespace=index_name)
return format_ticket_context(fetched)
else:
return "No matching tickets found."
# Download GPT4All's Nous-Hermes2 model
local_path = "./models/ggml-gpt4all-l13b-snoozy.bin"
if not os.path.exists(local_path):
import requests
from pathlib import Path
from tqdm import tqdm
Path(local_path).parent.mkdir(parents=True, exist_ok=True)
url = 'http://gpt4all.io/models/ggml-gpt4all-l13b-snoozy.bin'
response = requests.get(url, stream=True)
with open(local_path, 'wb') as f:
for chunk in tqdm(response.iter_content(chunk_size=8192)):
if chunk:
f.write(chunk)
# Initialize LLM with GPT4All
callbacks = [StreamingStdOutCallbackHandler()]
llm = GPT4All(model=local_path, backend="gptj", callbacks=callbacks, verbose=True)
# Define PromptTemplate
template = """You are an AI specialized in providing insights based on ticket data. Below is the context from similar tickets: {context} Here's the question: {question} Please provide the best response:"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
index = pinecone.Index('index')
query_model = initialize_models()
question = "What happens in the ticket that mentions AutoZone?"
context = query_tickets(question, 'index')
final_response = llm_chain.run(context=context, question=question)
print(final_response)
I am not receiving any errors currently, but the code doesn’t perform as expected. Can anyone help me understand what might be wrong? Thank you!
I looked through the official Pinecone and Freshdesk documentation to find relevant examples and guides for upserting and querying the data. Additionally, I consulted tutorials and online forums that provided insights into similar tasks. I even sought assistance from GPT-4 to generate code snippets and get an understanding of potential solutions. I expected that by following these resources, I would be able to successfully upsert Freshdesk ticket data into Pinecone and then query that data according to my requirements. However, despite trying these multiple approaches, the code didn’t perform as expected, and I am still struggling to achieve the desired result.