When I insert the embeddings, I still see duplicate content, even though I am checking for duplicates using LSH.
# Upsert each page chunk into the vector index, skipping any chunk whose
# MinHash signature already matches an entry in the LSH index.
for split_page in split_pages:
    print('split_page', split_page.text)
    embedding = generate_embeddings(split_page.text)

    # Build a MinHash signature from the stringified embedding components.
    # NOTE(review): MinHash estimates set overlap, so hashing float values will
    # presumably only match (near-)identical embeddings; hashing word/character
    # shingles of the text may suit near-duplicate detection better -- confirm.
    # NOTE(review): assumes `lsh` was created with num_perm=128 -- confirm.
    m = MinHash(num_perm=128)
    for d in embedding:
        m.update(str(d).encode('utf8'))

    # Guard clause: a non-empty query result means a matching signature exists.
    if lsh.query(m):
        print(f"Duplicated content detected: {split_page.text}")
        continue

    # Only allocate an id for chunks that are actually stored.
    vector_id = shortuuid.uuid()
    params = [
        {
            "id": vector_id,
            "values": embedding,
            "metadata": {
                "content": split_page.text,
                "Tone": ["Formal", "Legal", "General"],
                "time_stamp": 0,
            },
        }
    ]
    index.upsert(params, namespace=user_id)

    # Register the signature so later chunks can be matched against this one.
    # Nothing else in this loop adds to `lsh`, so without this insert the
    # query above could never report a duplicate among the chunks processed
    # here. Done after the upsert so a failed upsert does not mark the content
    # as already stored.
    lsh.insert(vector_id, m)