Even ChatGPT is unable to get past these errors. What should be changed?

VishwasAI · April 5, 2023, 5:29pm

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from ``openai.Embedding.create``.

    Newlines in ``text`` are replaced with spaces before the request is made.

    Args:
        text: The string to embed.
        model: Model name forwarded to ``openai.Embedding.create``.

    Returns:
        The value found at ``result["data"][0]["embedding"]``, or ``None`` when
        ``text`` is not a string, the response lacks that entry, or the
        request raises.
    """
    # Missing cells (None / NaN) have no ``replace`` method; report them the
    # same way as any other failure instead of raising AttributeError.
    if not isinstance(text, str):
        print(f"Error: Invalid input type {type(text).__name__} for text: {text}")
        return None

    text = text.replace("\n", " ")
    try:
        result = openai.Embedding.create(input=[text], model=model)
        if "data" in result and result["data"] and "embedding" in result["data"][0]:
            return result["data"][0]["embedding"]
        print(f"Error: Invalid embedding result for text: {text}")
        return None
    except Exception as e:
        # Best-effort: a failed request is reported and yields None so the
        # caller can carry on with the remaining texts.
        print(f"Error: {e} occurred while creating embedding for text: {text}")
        return None

import ast

# ``np`` is needed a few lines down, before the later ``import numpy as np``
# in this script has run.
import numpy as np

# Generate embeddings
# String cells are parsed with ast.literal_eval; every other value passes
# through unchanged.
df_new["ada_embedding"] = df_new["ada_embedding"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Filter out rows with None values in the ada_embedding column
#df_new = df_new[df_new["ada_embedding"].notna()]

# Ensure embeddings are in the correct format (e.g., lists or arrays of numerical values)
# Lists and NumPy arrays become plain lists; anything else becomes None.
df_new["ada_embedding"] = df_new["ada_embedding"].apply(lambda x: list(x) if isinstance(x, (list, np.ndarray)) else None)

# Create an index
index = pinecone.Index("apex-reference")

from tqdm.auto import tqdm

import numpy as np
import json

def to_vectors(embeddings):
    """Convert an iterable of embeddings into a list of NumPy arrays.

    String items are parsed with ``ast.literal_eval`` and list items are
    wrapped directly. Items of any other type, and items whose conversion
    raises ``ValueError``, ``TypeError`` or ``SyntaxError``, are reported with
    ``print`` and skipped.

    Args:
        embeddings: Iterable whose items are lists or string representations
            of lists.

    Returns:
        A list of ``numpy.ndarray``. Skipped items are dropped rather than
        replaced by a placeholder, so the result can be shorter than the
        input and positions may not line up with it.
    """
    vectors = []
    for embedding in embeddings:
        try:
            if isinstance(embedding, str):
                vector = np.array(ast.literal_eval(embedding))
            elif isinstance(embedding, list):
                vector = np.array(embedding)
            else:
                raise TypeError("Invalid input type")
        except (ValueError, TypeError, SyntaxError):
            print(f"Could not convert embedding {embedding} to vector")
            continue
        vectors.append(vector)
    return vectors

# Upsert in batches; batch_size is 1 here.
batch_size = 1

for i in tqdm(range(0, len(df_new), batch_size)):
    # Find end of batch
    i_end = min(i + batch_size, len(df_new))
    # Extract batch
    batch = df_new.iloc[i:i_end]
    # Generate embeddings for batch (ensure embeddings are in the correct format)
    emb = to_vectors(batch["ada_embedding"])
    # Get metadata (optional)
    meta = batch[["PageNumber", "PageContent"]].to_dict(orient="records")
    # Create unique IDs
    ids = [f"{idx}" for idx in range(i, i_end)]
    # Create a dictionary for upsert
    to_upsert = dict(zip(ids, emb))
    # Upsert/insert these records to Pinecone
    if not to_upsert:
        print("Empty batch, skipping…")
        continue
    try:
        # NOTE(review): this is the call that fails. `to_upsert` is a dict, and
        # iterating a dict yields its string keys, which matches the reported
        # "cannot interpret type <class 'str'>" error. The revision that works
        # later in this thread passes a list of (id, vector, metadata) tuples
        # and no separate `metadata=` argument.
        _ = index.upsert(vectors=to_upsert, metadata=meta)
    except Exception as e:
        print(f"Error: {e} occurred during upsert, skipping batch…")
        continue

# check that we have all vectors in index
index.describe_index_stats()

Error: Error: Invalid vector value passed: cannot interpret type <class ‘str’> occurred during upsert, skipping batch…

VishwasAI · April 6, 2023, 3:51am

I tried this, which works:
# Upsert a single record given as one (id, vector, metadata) tuple with empty metadata.
_ = index.upsert(vectors=[
(f"Page_{i}", embeds[0],{})
])

Not sure what the issue with zip is.

VishwasAI · April 6, 2023, 3:54am

Revised code for reference. Note that passing the to_upsert dict built with zip does not work, but the lower-level tuple format does. Could an expert from Pinecone support please help? Is this a bug?

# Upsert in batches; batch_size is 1 here.
batch_size = 1

for i in tqdm(range(0, len(df_new), batch_size)):
    # Find end of batch
    i_end = min(i + batch_size, len(df_new))
    # Extract batch
    batch = df_new.iloc[i:i_end]
    # Generate embeddings for batch (ensure embeddings are in the correct format)
    res = openai.Embedding.create(input=list(batch["combined"]), engine="text-embedding-ada-002")
    embeds = [record["embedding"] for record in res["data"]]
    # Get metadata (optional)
    meta = batch[["PageNumber", "PageContent"]].to_dict(orient="records")
    # Create unique IDs
    ids = [f"Page_{idx}" for idx in range(i, i_end)]
    # Create a dictionary for upsert
    to_upsert = dict(zip(ids, embeds))
    #print(to_upsert)
    # Upsert/insert these records to Pinecone
    if not to_upsert:
        print("Empty batch, skipping…")
        continue
    try:
        print(i,embeds)
        # NOTE(review): only embeds[0] is sent, under the id Page_{i} and with
        # empty metadata. That covers the whole batch only while batch_size
        # is 1; `to_upsert` and `meta` are not passed to this call.
        _ = index.upsert(vectors=[
            (f"Page_{i}", embeds[0],{})
        ])
    except Exception as e:
        print(f"Error: {e} occurred during upsert, skipping batch…")
        continue

# check that we have all vectors in index
index.describe_index_stats()

VishwasAI · April 6, 2023, 2:04pm

OK, so this finally worked. Hopefully it will save someone else countless hours of effort.

index = pinecone.Index("apex-reference")

from tqdm.auto import tqdm

import numpy as np

# we will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(df_new), batch_size)):
    # Find end of batch
    i_end = min(i + batch_size, len(df_new))
    # Extract batch
    batch = df_new.iloc[i:i_end]
    # Generate embeddings for batch (ensure embeddings are in the correct format)
    res = openai.Embedding.create(input=list(batch["combined"]), engine="text-embedding-ada-002")
    embeds = [record["embedding"] for record in res["data"]]
    # Get metadata (optional)
    meta = batch[["PageNumber", "PageContent"]].to_dict(orient="records")
    # Create unique IDs
    ids = [f"Page_{idx}" for idx in range(i, i_end)]
    # Build the records as (id, vector, metadata) tuples, one per row of the
    # batch. The ids are unique within a batch, so zipping the three
    # sequences directly gives the same tuples as going through a dict.
    to_upsert_list = list(zip(ids, embeds, meta))
    # Upsert/insert these records to Pinecone
    if not to_upsert_list:
        print("Empty batch, skipping…")
        continue
    try:
        _ = index.upsert(vectors=to_upsert_list)
    except Exception as e:
        # Best-effort: report the failed batch and move on to the next one.
        print(f"Error: {e} occurred during upsert, skipping batch…")