Hi, this is the code I am using to upload the public dataset “amazon-toys-quora-all-minilm-l6-bm25” to a Pinecone serverless index:
amz_ds.to_pinecone_index(
index_name="amazon-toys-quora-all-minilm-l6-bm25",
should_create_index=True,
serverless=True,
cloud="aws",
region="us-west-2",
)
I get this error:
{
"name": "PineconeApiException",
"message": "(400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 15 May 2024 19:16:52 GMT', 'Content-Type': 'application/json', 'Content-Length': '92', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '341', 'x-pinecone-request-id': '4313797377394377488', 'x-envoy-upstream-service-time': '6', 'server': 'envoy'})
HTTP response body: {\"code\":3,\"message\":\"Sparse vector size 2211 exceeds the maximum size of 1000\",\"details\":[]}
",
"stack": "---------------------------------------------------------------------------
PineconeApiException Traceback (most recent call last)
Cell In[32], line 1
----> 1 amz_ds.to_pinecone_index(
2 index_name=\"amazon-toys-quora-all-minilm-l6-bm25\",
3 should_create_index=True,
4 serverless=True,
5 cloud=\"aws\",
6 region=\"us-west-2\",
7 )
File ~/miniforge3/lib/python3.10/site-packages/pinecone_datasets/dataset.py:569, in Dataset.to_pinecone_index(self, index_name, namespace, should_create_index, batch_size, show_progress, api_key, environment, region, cloud, serverless, **kwargs)
566 else:
567 self._set_pinecone_index(api_key=api_key, **kwargs)
--> 569 return self._upsert_to_index(
570 index_name=index_name,
571 namespace=namespace,
572 batch_size=batch_size,
573 show_progress=show_progress,
574 )
File ~/miniforge3/lib/python3.10/site-packages/pinecone_datasets/dataset.py:444, in Dataset._upsert_to_index(self, index_name, namespace, batch_size, show_progress)
439 def _upsert_to_index(
440 self, index_name: str, namespace: str, batch_size: int, show_progress: bool
441 ):
442 pinecone_index = self._pinecone_client.Index(index_name)
--> 444 res = pinecone_index.upsert_from_dataframe(
445 self.documents[self._config.Schema.documents_select_columns].dropna(
446 axis=1, how=\"all\"
447 ),
448 namespace=namespace,
449 batch_size=batch_size,
450 show_progress=show_progress,
451 )
452 return {\"upserted_count\": res.upserted_count}
File ~/miniforge3/lib/python3.10/site-packages/pinecone/data/index.py:229, in Index.upsert_from_dataframe(self, df, namespace, batch_size, show_progress)
227 results = []
228 for chunk in self._iter_dataframe(df, batch_size=batch_size):
--> 229 res = self.upsert(vectors=chunk, namespace=namespace)
230 pbar.update(len(chunk))
231 results.append(res)
File ~/miniforge3/lib/python3.10/site-packages/pinecone/utils/error_handling.py:10, in validate_and_convert_errors.<locals>.inner_func(*args, **kwargs)
7 @wraps(func)
8 def inner_func(*args, **kwargs):
9 try:
---> 10 return func(*args, **kwargs)
11 except MaxRetryError as e:
12 if isinstance(e.reason, ProtocolError):
File ~/miniforge3/lib/python3.10/site-packages/pinecone/data/index.py:168, in Index.upsert(self, vectors, namespace, batch_size, show_progress, **kwargs)
161 raise ValueError(
162 \"async_req is not supported when batch_size is provided.\"
163 \"To upsert in parallel, please follow: \"
164 \"https://docs.pinecone.io/docs/insert-data#sending-upserts-in-parallel\"
165 )
167 if batch_size is None:
--> 168 return self._upsert_batch(vectors, namespace, _check_type, **kwargs)
170 if not isinstance(batch_size, int) or batch_size <= 0:
171 raise ValueError(\"batch_size must be a positive integer\")
File ~/miniforge3/lib/python3.10/site-packages/pinecone/data/index.py:189, in Index._upsert_batch(self, vectors, namespace, _check_type, **kwargs)
186 args_dict = self._parse_non_empty_args([(\"namespace\", namespace)])
187 vec_builder = lambda v: VectorFactory.build(v, check_type=_check_type)
--> 189 return self._vector_api.upsert(
190 UpsertRequest(
191 vectors=list(map(vec_builder, vectors)),
192 **args_dict,
193 _check_type=_check_type,
194 **{k: v for k, v in kwargs.items() if k not in _OPENAPI_ENDPOINT_PARAMS},
195 ),
196 **{k: v for k, v in kwargs.items() if k in _OPENAPI_ENDPOINT_PARAMS},
197 )
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/api_client.py:772, in Endpoint.__call__(self, *args, **kwargs)
761 def __call__(self, *args, **kwargs):
762 \"\"\" This method is invoked when endpoints are called
763 Example:
764
(...)
770
771 \"\"\"
--> 772 return self.callable(self, *args, **kwargs)
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/api/data_plane_api.py:1084, in DataPlaneApi.__init__.<locals>.__upsert(self, upsert_request, **kwargs)
1081 kwargs['_host_index'] = kwargs.get('_host_index')
1082 kwargs['upsert_request'] = \\
1083 upsert_request
-> 1084 return self.call_with_http_info(**kwargs)
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/api_client.py:834, in Endpoint.call_with_http_info(self, **kwargs)
830 header_list = self.api_client.select_header_content_type(
831 content_type_headers_list)
832 params['header']['Content-Type'] = header_list
--> 834 return self.api_client.call_api(
835 self.settings['endpoint_path'], self.settings['http_method'],
836 params['path'],
837 params['query'],
838 params['header'],
839 body=params['body'],
840 post_params=params['form'],
841 files=params['file'],
842 response_type=self.settings['response_type'],
843 auth_settings=self.settings['auth'],
844 async_req=kwargs['async_req'],
845 _check_type=kwargs['_check_return_type'],
846 _return_http_data_only=kwargs['_return_http_data_only'],
847 _preload_content=kwargs['_preload_content'],
848 _request_timeout=kwargs['_request_timeout'],
849 _host=_host,
850 collection_formats=params['collection_format'])
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/api_client.py:409, in ApiClient.call_api(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, async_req, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)
355 \"\"\"Makes the HTTP request (synchronous) and returns deserialized data.
356
357 To make an async_req request, set the async_req parameter.
(...)
406 then the method will return the response directly.
407 \"\"\"
408 if not async_req:
--> 409 return self.__call_api(resource_path, method,
410 path_params, query_params, header_params,
411 body, post_params, files,
412 response_type, auth_settings,
413 _return_http_data_only, collection_formats,
414 _preload_content, _request_timeout, _host,
415 _check_type)
417 return self.pool.apply_async(self.__call_api, (resource_path,
418 method, path_params,
419 query_params,
(...)
427 _request_timeout,
428 _host, _check_type))
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/api_client.py:203, in ApiClient.__call_api(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)
201 except PineconeApiException as e:
202 e.body = e.body.decode('utf-8')
--> 203 raise e
205 self.last_response = response_data
207 return_data = response_data
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/api_client.py:196, in ApiClient.__call_api(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)
192 url = _host + resource_path
194 try:
195 # perform request and return response
--> 196 response_data = self.request(
197 method, url, query_params=query_params, headers=header_params,
198 post_params=post_params, body=body,
199 _preload_content=_preload_content,
200 _request_timeout=_request_timeout)
201 except PineconeApiException as e:
202 e.body = e.body.decode('utf-8')
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/api_client.py:455, in ApiClient.request(self, method, url, query_params, headers, post_params, body, _preload_content, _request_timeout)
447 return self.rest_client.OPTIONS(url,
448 query_params=query_params,
449 headers=headers,
(...)
452 _request_timeout=_request_timeout,
453 body=body)
454 elif method == \"POST\":
--> 455 return self.rest_client.POST(url,
456 query_params=query_params,
457 headers=headers,
458 post_params=post_params,
459 _preload_content=_preload_content,
460 _request_timeout=_request_timeout,
461 body=body)
462 elif method == \"PUT\":
463 return self.rest_client.PUT(url,
464 query_params=query_params,
465 headers=headers,
(...)
468 _request_timeout=_request_timeout,
469 body=body)
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/rest.py:302, in RESTClientObject.POST(self, url, headers, query_params, post_params, body, _preload_content, _request_timeout)
300 def POST(self, url, headers=None, query_params=None, post_params=None,
301 body=None, _preload_content=True, _request_timeout=None):
--> 302 return self.request(\"POST\", url,
303 headers=headers,
304 query_params=query_params,
305 post_params=post_params,
306 _preload_content=_preload_content,
307 _request_timeout=_request_timeout,
308 body=body)
File ~/miniforge3/lib/python3.10/site-packages/pinecone/core/client/rest.py:261, in RESTClientObject.request(self, method, url, query_params, headers, body, post_params, _preload_content, _request_timeout)
258 if 500 <= r.status <= 599:
259 raise ServiceException(http_resp=r)
--> 261 raise PineconeApiException(http_resp=r)
263 return r
PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 15 May 2024 19:16:52 GMT', 'Content-Type': 'application/json', 'Content-Length': '92', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '341', 'x-pinecone-request-id': '4313797377394377488', 'x-envoy-upstream-service-time': '6', 'server': 'envoy'})
HTTP response body: {\"code\":3,\"message\":\"Sparse vector size 2211 exceeds the maximum size of 1000\",\"details\":[]}
"
}
Is this maximum sparse vector size of 1000 a hard limit, or can it be changed? Is it documented somewhere? How can I work around it?
Thanks