I’m using a Node script to load a .txt file into a Pinecone index as vectors, and then trying to query something about the loaded text. My index has the following characteristics:
- metric: cosine
- dimension: 1024
- deploy: AWS
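For completeness, a serverless index with these settings would be created along these lines (the index name and AWS region below are placeholders, not my actual values):

```typescript
// import { Pinecone } from "@pinecone-database/pinecone";

// Index settings matching the characteristics above.
const indexConfig = {
  name: "my-index",          // placeholder name
  dimension: 1024,           // multilingual-e5-large produces 1024-dimensional vectors
  metric: "cosine" as const,
  spec: { serverless: { cloud: "aws" as const, region: "us-east-1" } }, // region is a guess
};

// Creating it would look roughly like:
// const pc = new Pinecone({ apiKey: process.env.PINECONE_API_KEY! });
// await pc.createIndex(indexConfig);
```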
The file is split into chunks and embedded using the model ‘multilingual-e5-large’ as follows:
import { Pinecone } from "@pinecone-database/pinecone";
import { Document } from "langchain/document";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

export default async (client: Pinecone, indexName: string, docs: Document<Record<string, any>>[]) => {
  console.log("Retrieving Pinecone index...");
  const index = client.Index(indexName);
  console.log(`Pinecone index retrieved: ${indexName}`);

  for (const doc of docs) {
    console.log(`Processing document: ${doc.metadata.source}`);
    const txtPath = doc.metadata.source;
    const text = doc.pageContent;

    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 800,
    });
    const chunks = await textSplitter.createDocuments([text]);
    console.log(`Text split into ${chunks.length} chunks`);

    console.log(
      `Calling Pinecone's Inference embedding endpoint with ${chunks.length} text chunks ...`
    );
    const embeddings = await client.inference.embed(
      'multilingual-e5-large',
      chunks.map((chunk) => chunk.pageContent),
      { inputType: 'passage', truncate: 'END' }
    );
    console.log("Finished embedding documents");

    console.log(
      `Creating ${chunks.length} vectors array with id, values, and metadata...`
    );
    const batchSize = 100;
    let batch = [];
    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      const vector = {
        id: `${txtPath}_${i}`,
        values: embeddings[i].values,
        metadata: {
          ...chunk.metadata,
          loc: JSON.stringify(chunk.metadata.loc),
          pageContent: chunk.pageContent,
          txtPath: txtPath,
        },
      };
      batch.push(vector);
      // Flush when the batch is full or we've reached the last chunk.
      if (batch.length === batchSize || i === chunks.length - 1) {
        await index.upsert(batch);
        batch = [];
      }
    }
    console.log(`Pinecone index updated with ${chunks.length} vectors`);
  }
};
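The inner batching logic can be factored into a small pure helper (a hypothetical `toBatches`, not part of my script), which makes the flush-on-last-chunk behavior easy to verify in isolation:

```typescript
// Split an array into consecutive batches of at most `size` elements.
// Mirrors the loop above: a batch is flushed when it reaches `size`
// or when the last element has been pushed.
function toBatches<T>(items: T[], size: number): T[][] {
  const batches: T[][] = [];
  let batch: T[] = [];
  for (let i = 0; i < items.length; i++) {
    batch.push(items[i]);
    if (batch.length === size || i === items.length - 1) {
      batches.push(batch);
      batch = [];
    }
  }
  return batches;
}

// Usage sketch: for (const b of toBatches(vectors, 100)) await index.upsert(b);
```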
And then I’m querying Pinecone like this:
import { Pinecone } from "@pinecone-database/pinecone";

export default async (
  client: Pinecone,
  indexName: string,
  query: string
) => {
  console.log("Querying Pinecone vector store...");
  const index = client.Index(indexName);

  // Embed the query with the same model used for the documents.
  const embeddings = await client.inference.embed(
    'multilingual-e5-large',
    [query],
    { inputType: 'query' }
  );

  const queryResponse = await index.query({
    topK: 10,
    vector: embeddings[0].values,
    includeMetadata: true,
    includeValues: true,
  });
  console.log(`Found ${queryResponse.matches.length} matches...`);

  let concatenatedQueryResponse = "";
  if (queryResponse.matches.length > 0) {
    concatenatedQueryResponse = queryResponse.matches
      .map((match) => match.metadata.pageContent)
      .join("\n");
  }
  return concatenatedQueryResponse;
};
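The concatenation at the end can also be pulled into a small pure function (a hypothetical `concatMatches`, just for illustration), which makes the "no matches" case explicit and is easy to test with fake match objects:

```typescript
// Minimal shape of the fields read from a Pinecone query match.
interface MatchLike {
  metadata?: { pageContent?: string };
}

// Join the stored page content of all matches, skipping any match
// whose metadata is missing. Returns "" when there are no matches.
function concatMatches(matches: MatchLike[]): string {
  return matches
    .map((m) => m.metadata?.pageContent ?? "")
    .filter((text) => text.length > 0)
    .join("\n");
}
```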
The text file being loaded is just a little story (created by GPT), and the query asks about the names of its characters, but the result is always 0 matches. Am I maybe missing something?