def get_embedding(text, model=“text-embedding-ada-002”):
text = text.replace(“\n”, " “)
try:
result = openai.Embedding.create(input=[text], model=model)
if ‘data’ in result and result[‘data’] and ‘embedding’ in result[‘data’][0]:
return result[‘data’][0][‘embedding’]
else:
print(f"Error: Invalid embedding result for text: {text}”)
return None
except Exception as e:
print(f"Error: {e} occurred while creating embedding for text: {text}")
return None
import ast
Generate embeddings
df_new[‘ada_embedding’] = df_new[‘ada_embedding’].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
Filter out rows with None values in the ada_embedding column
#df_new = df_new[df_new[‘ada_embedding’].notna()]
Ensure embeddings are in the correct format (e.g., lists or arrays of numerical values)
df_new[‘ada_embedding’] = df_new[‘ada_embedding’].apply(lambda x: list(x) if isinstance(x, (list, np.ndarray)) else None)
Create an index
index = pinecone.Index(“apex-reference”)
from tqdm.auto import tqdm
import numpy as np
import json
def to_vectors(embeddings):
vectors =
for embedding in embeddings:
try:
if isinstance(embedding, str):
vector = np.array(ast.literal_eval(embedding))
elif isinstance(embedding, list):
vector = np.array(embedding)
else:
raise TypeError(“Invalid input type”)
vectors.append(vector)
except (ValueError, TypeError, SyntaxError):
print(f"Could not convert embedding {embedding} to vector")
continue
return vectors
we will use batches of 5
batch_size = 1
for i in tqdm(range(0, len(df_new), batch_size)):
# Find end of batch
i_end = min(i + batch_size, len(df_new))
# Extract batch
batch = df_new.iloc[i:i_end]
# Generate embeddings for batch (ensure embeddings are in the correct format)
emb = to_vectors(batch[‘ada_embedding’])
# Get metadata (optional)
meta = batch[[‘PageNumber’, ‘PageContent’]].to_dict(orient=‘records’)
# Create unique IDs
ids = [f"{idx}" for idx in range(i, i_end)]
# Create a dictionary for upsert
to_upsert = dict(zip(ids, emb))
# Upsert/insert these records to Pinecone
if not to_upsert:
print(“Empty batch, skipping…”)
continue
try:
_ = index.upsert(vectors=to_upsert, metadata=meta)
except Exception as e:
print(f"Error: {e} occurred during upsert, skipping batch…")
continue
check that we have all vectors in index
index.describe_index_stats()
Error: Error: Invalid vector value passed: cannot interpret type <class ‘str’> occurred during upsert, skipping batch…