I am trying to fetch all the unique metadata values for a particular metadata field.
My issue is that a single query returns at most 10k vectors, and I am not sure how to make sure that my next 10k query doesn't overlap with the results of the previous one.
def get_unique_metadata_values(
    filter_query,
    metadata_field,
    *,
    index_client=None,
    dimension=1536,
    page_size=1000,
):
    """Collect the distinct values of one metadata field across matching vectors.

    The query API has no ID cursor (there is no ``id_gt`` argument) and
    returns matches ranked by similarity, so "give me the next page after
    this ID" cannot be expressed. Instead, every round trip excludes the
    values that were already collected by adding a ``$nin`` clause on
    ``metadata_field``. Each page can therefore only contain vectors whose
    value has not been seen yet, and pages never overlap.

    Args:
        filter_query: Metadata filter restricting which vectors are
            considered. ``None`` or an empty dict means "no restriction".
            The object is never mutated.
        metadata_field: Name of the metadata key whose values are collected.
        index_client: Object exposing ``query(vector=, top_k=, filter=,
            include_metadata=)``. Defaults to the module-level ``index``.
        dimension: Length of the placeholder query vector; must equal the
            dimension of the index.
        page_size: ``top_k`` used for every query.

    Returns:
        A set with every distinct value found. List-valued metadata is
        flattened, so each element is reported individually.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be a positive integer")

    # Resolve the global lazily so callers that inject a client do not need
    # a module-level ``index`` to exist.
    client = index if index_client is None else index_client

    # Placeholder vector: ranking is irrelevant here, only the filter matters.
    # NOTE(review): an all-zero vector may be rejected or score arbitrarily
    # with some similarity metrics - confirm against the index's metric.
    query_vector = [0] * dimension
    unique_values = set()

    while True:
        # Only vectors that actually carry the field are interesting.
        # NOTE(review): relies on the `$exists` / `$nin` filter operators;
        # confirm they are available on the target index type.
        clauses = [{metadata_field: {"$exists": True}}]
        if filter_query:
            clauses.insert(0, filter_query)
        if unique_values:
            # NOTE(review): the exclusion list grows with the number of
            # distinct values; very high-cardinality fields may exceed the
            # service's filter-size limits.
            clauses.append({metadata_field: {"$nin": list(unique_values)}})
        query_filter = clauses[0] if len(clauses) == 1 else {"$and": clauses}

        query_results = client.query(
            vector=query_vector,
            top_k=page_size,
            filter=query_filter,
            include_metadata=True,
        )
        matches = query_results["matches"]
        if not matches:
            break

        found_new = False
        for match in matches:
            metadata = match.get("metadata") or {}
            value = metadata.get(metadata_field)
            if value is None:
                continue
            # Lists are unhashable, so flatten them instead of adding the
            # list object itself to the set.
            # NOTE(review): for list-valued fields, a vector sharing one
            # already-seen element may be excluded by `$nin` before its other
            # elements are collected - verify the service's semantics.
            candidates = value if isinstance(value, (list, tuple)) else (value,)
            for candidate in candidates:
                if candidate not in unique_values:
                    unique_values.add(candidate)
                    print("Adding", candidate)
                    found_new = True

        # A non-empty page that contributes nothing new would repeat forever
        # with the same filter, so stop instead of looping.
        if not found_new:
            break

    return unique_values