I am building a Q&A service to help Adults who have questions about their aging parents. I pulled some data in from Reddit, cleaned it and put it in a CSV file that you can view here. After converting my data into embeddings I try to upload to pinecone but receive an error.
You can view the traceback here:
Traceback (most recent call last):
File "/Users/luseniikromah/Developer/helpingyoungadults/cluster_aging_data.py", line 76, in <module>
index.upsert(vectors=to_upsert)
File "/usr/local/lib/python3.9/site-packages/pinecone/core/utils/error_handling.py", line 17, in inner_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/pinecone/index.py", line 147, in upsert
return self._upsert_batch(vectors, namespace, _check_type, **kwargs)
File "/usr/local/lib/python3.9/site-packages/pinecone/index.py", line 231, in _upsert_batch
return self._vector_api.upsert(
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/api_client.py", line 776, in __call__
return self.callable(self, *args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/api/vector_operations_api.py", line 956, in __upsert
return self.call_with_http_info(**kwargs)
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/api_client.py", line 838, in call_with_http_info
return self.api_client.call_api(
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/api_client.py", line 413, in call_api
return self.__call_api(resource_path, method,
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/api_client.py", line 207, in __call_api
raise e
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/api_client.py", line 200, in __call_api
response_data = self.request(
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/api_client.py", line 459, in request
return self.rest_client.POST(url,
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/rest.py", line 271, in POST
return self.request("POST", url,
File "/usr/local/lib/python3.9/site-packages/pinecone/core/client/rest.py", line 230, in request
raise ApiException(http_resp=r)
pinecone.core.client.exceptions.ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'date': 'Tue, 28 Feb 2023 19:56:35 GMT', 'x-envoy-upstream-service-time': '2', 'content-length': '60', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"not supported value type","details":[]}
So far I have:
- removed all non alphanumeric data from dataset
- made sure all vectors in batch loop are non empty
- checked type of all batched vectors
Still no luck. I have attached my code below for further review.
import openai
import pandas as pd
import pinecone
from sys import getsizeof
import json
from datasets import load_dataset
from tqdm import tqdm
# set up OpenAI API credentials
openai.api_key = 'sk-API'
openai.organization = 'org-API'
# read the CSV file into a Pandas dataframe
df = pd.read_csv('aging.csv', encoding='utf-8')
data = load_dataset("csv",data_files='aging.csv', split='train')
pinecone.init(api_key='API', environment='us-east1-gcp')
text = [
f"Thread Title: {x['Title']}\n\n"+
f"Question Asked: {x['Description']}\n\n"+
f"Answer: {x['Top Comments']}\n\n" for x in data
]
MODEL = 'text-embedding-ada-002'
res = openai.Embedding.create(input=text, engine=MODEL)
# print(res)
# print(len(res['data'][0]['embedding']))
if 'agingparents' not in pinecone.list_indexes():
pinecone.create_index('agingparents',
dimension=len(res['data'][0]['embedding']),
metric='cosine',
metadata_config={'indexed':['docs']}
)
index = pinecone.Index('agingparents')
print(data[0])
# creating a vector embedding for each sample in batches of 32
batch_size = 32
for i in tqdm(range(0,len(text), batch_size)):
i_end = min(i+batch_size, len(text))
# get batch of 32 lines and ids
text_batch = text[i:i_end]
# actual phrases being attached to the end of each vector
meta_batch = [data[x] for x in range(i,i_end)]
# random ass ids
ids_batch = [str(n) for n in range(i,i_end)]
# create embeddings
res = openai.Embedding.create(input=text_batch, engine=MODEL)
embeds = [record['embedding'] for record in res['data']]
to_upsert = list(zip(ids_batch, embeds, meta_batch))
# upsert to Pinecone
index.upsert(vectors=to_upsert)
print(index.describe_index_stats())```