I’m creating an index like this:
def create_and_populate_index(index_name, corpus_path,
transformer_name='sentence-transformers/paraphrase-MiniLM-L6-v2',
transformer_dim=384):
if index_name not in pinecone.list_indexes():
pinecone.create_index(name=index_name, metric="cosine", dimension=transformer_dim)
index = pinecone.Index(index_name)
content, sources = load_corpus(corpus_path)
embedder = SentenceTransformer(transformer_name)
embeddings = embedder.encode(content)
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
batch_size = 32
for i in range(0, len(content), batch_size):
print(i)
batch = []
meta = {}
for j in range(i, i + batch_size if i + batch_size < len(content) else len(content)):
meta['id'] = str(j)
meta["metadata"] = {}
meta["metadata"]["text"] = content[j]
meta["metadata"]["source"] = sources[j]
meta['values'] = embeddings[j , :].tolist()
batch.append(meta)
print(batch)
index.upsert(vectors=batch)
return index
Next I send a query:
embedder = SentenceTransformer(TRANSFORMER_NAME)
query = "what should I eat for breakfast?"
query_embedding = embedder.encode([query])[0]
query_embedding /= np.linalg.norm(query_embedding)
results = index.query(query_embedding.tolist(), top_k=2, include_metadata=True)
The results are bad:
{'matches': [{'id': '95',
'metadata': {'source': 'www.nhs.uk/live-well/eat-well/how-to-eat-a-balanced-diet/the-vegetarian-diet/:',
'text': "You don't need to achieve this balance "
'with every meal, but try to get the '
'balance right over a day, or even a week. '
'Choose options low in fat, salt and sugar '
'whenever you can.'},
'score': 0.512832582,
'values': []},
{'id': '351',
'metadata': {'source': 'www.nhs.uk/live-well/eat-well/food-guidelines-and-food-labels/the-eatwell-guide/',
'text': 'These foods include chocolate, cakes, '
'biscuits, sugary soft drinks, butter, ghee '
'and ice cream.'},
'score': 0.455812603,
'values': []},
But if I create my own index:
class DocumentIndex:
def __init__(self, content, sources, max_length=1024, min_length=10):
self.embedder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
self.summaries = [c for c in content]
self.sources = [s for s in sources]
embeddings_full = torch.Tensor(self.embedder.encode(self.summaries))
self.embeddings = embeddings_full / embeddings_full.norm(dim=1, keepdim=True)
def _embed(self, text):
embed_full = torch.Tensor(self.embedder.encode([text])[0])
return embed_full / embed_full.norm()
def search(self, query, k=5):
query_embedding = self._embed(query)
distances = (query_embedding @ self.embeddings.T).squeeze()
inds = torch.argsort(distances, descending=True)
return distances[inds[:k]], [self.summaries[i] for i in inds[:k]], [self.sources[i] for i in inds[:k]]
Then run:
index = DocumentIndex(*load_corpus("../../nutrition/src/nhs_corpus.txt"))
distances, content, _ = index.search(query, k=2)
for i in range(2):
print(distances[i], content[i])
print("---")
I get much better results:
tensor(0.6537) Breakfast: instead of a full English breakfast, go for a poached egg on toast with mushrooms and grilled tomatoes. If you do have meat, have either bacon or a sausage, but not both.
---
tensor(0.6187) You could also swap your mid-morning biscuit for a banana, and add a side salad to your lunch.
---
Am I doing something wrong? Is this expected?