I want to add the filename to the metadata using Pinecone.from_texts

Every thing is loading fine with the PDF and the search comes back fine. I want to add the filename to the search results because I am loading multiple PDFs. Where do I add the filename so it is returned in the search? Do I add it in Pinecone.from_texts or inject it in with the text splitter?

from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone,sys

loader = UnstructuredFileLoader("EVMS.pdf")
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_documents(data)

OPENAI_API_KEY = 'My OPENAI_API_KEY'
PINECONE_API_KEY = 'My PINECONE_API_KEY'
PINECONE_API_ENV = 'us-east-1-aws'

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

pinecone.init(
api_key=PINECONE_API_KEY,
environment=PINECONE_API_ENV
)

indexname = "MyIndex"
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=indexname)

The from_texts classmethod in pinecone.py has this signature:
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    batch_size: int = 32,
    text_key: str = "text",
    index_name: Optional[str] = None,
    namespace: Optional[str] = None,
    **kwargs: Any,
)

Thanks!

You can inject the PDF file name as a metadata field when using Pinecone.from_texts.
Pinecone.from_texts accepts a list in the metadatas parameter, so you need to create a list of dictionaries that corresponds one-to-one with your texts, for example:

texts = ['foo', 'bar', 'baz', 'qux', 'bar', 'quux']
meta = [{'filename': 'pdf1'}, {'filename': 'pdf2'}, {'filename': 'pdf3'}, {'filename': 'pdf2'}, {'filename': 'pdf4'}, {'filename': 'pdf4'}]

and then pass this list in the same call:

docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, metadatas=meta, index_name=indexname)

This will return the PDF name with the search results in the metadata field of the response object.
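
For reference, here is a minimal sketch of how that metadata comes back on a similarity search (it assumes the docsearch object from your snippet and the placeholder 'filename' key above):

results = docsearch.similarity_search("your question here", k=3)
for doc in results:
    #each returned Document carries the dict you passed in metadatas=
    print(doc.metadata.get('filename'), '->', doc.page_content[:80])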

So you need a dictionary for each chunk, appended to a list. Here is my working pdfLoader.py code along with the getData.py retrieval code.

pip install pinecone-client
pip install openai
pip install langchain
pip install unstructured
pip install pytesseract
pip install tiktoken
pip install pikepdf

#I am using pikepdf to split the PDF into single pages and passing that page's filename in the metadata to display as a link in the answer output
#This does throw a detectron2 error, but you can ignore it; it does not affect anything

from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone,sys, os
from pikepdf import Pdf

OPENAI_API_KEY = 'xxxxxxxxxx'

PINECONE_API_KEY = 'xxxxxxxxxx'
PINECONE_API_ENV = 'us-east-1-aws'

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# initialize pinecone

pinecone.init(
api_key=PINECONE_API_KEY, # find at app.pinecone.io
environment=PINECONE_API_ENV # next to api key in console
)
indexname = "Your PineCone Index Name"

#Filename must be full Path, not relative
filename = "C:/FileUploads/MyReport.pdf"
#filename = sys.argv[1] if you want to call the py file and pass a parameter

pdf = Pdf.open(filename)

#Split off the base filename without the .pdf so I can append 001.pdf to each page
name, ext = os.path.splitext(filename)

#Set Master filename for link in Answer, passed in Metadatas
masterfilename = os.path.basename(filename).split('/')[-1]

#print(masterfilename)

#Loop through each page
for pnum, page in enumerate(pdf.pages):
    #Split PDF into pages
    dst = Pdf.new()
    dst.pages.append(page)
    pagenum = pnum + 1
    #Append 3-padded number to name, starting with 001
    pagefilenamepath = name + f'{pagenum:03d}.pdf'
    dst.save(pagefilenamepath)
    #Set PageFilename to pass in Metadata
    pagefilename = os.path.basename(pagefilenamepath).split('/')[-1]

    #Read this page
    loader = UnstructuredFileLoader(pagefilenamepath)
    data = loader.load()
    #Chunk size can be 500 to 5000+
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=0)
    texts = text_splitter.split_documents(data)

    #Create a dictionary for each chunk on this page and append it to a list,
    #because metadatas= expects a List[dict]
    metalist = []
    for t in range(len(texts)):
        metadict = {}
        metadict["masterfilename"] = masterfilename
        metadict["pagefilename"] = pagefilename
        metadict["pdfpagenum"] = pagenum
        metalist.append(metadict)
    #print(metalist)

    docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, metadatas=metalist, index_name=indexname)

    print(pagefilename + " Processed")
    #Still in Page Loop

#End of this routine

This is the retrieval code (getData.py):

import pinecone
import openai
import sys
from urllib.parse import unquote

#Passing an argument/Prompt for the call
ARG = sys.argv[1]
RET = unquote(ARG)
#RET = "Ask a test question here?"

openai.api_key = 'xxxxxxxxxx'
PINECONE_API_KEY = 'xxxxxxxxxx'
PINECONE_API_ENV = 'us-east-1-aws'

# initialize pinecone

pinecone.init(
api_key=PINECONE_API_KEY, # find at app.pinecone.io
environment=PINECONE_API_ENV # next to api key in console
)

# connect to index

index = pinecone.Index('csr')

# view index stats

print(index.describe_index_stats())

embed_model = "text-embedding-ada-002"
query = RET

res = openai.Embedding.create(
input=[query],
model=embed_model
)

# limit the size of the context to add to the prompt

limit = 3750

# retrieve from Pinecone

xq = res['data'][0]['embedding']

# get relevant contexts

res = index.query(xq, top_k=2, include_metadata=True)
#print(res)

contexts = [x['metadata']['text'] for x in res['matches']]
metadata = [x['metadata']['masterfilename'] + ';' + x['metadata']['pagefilename'] + ';' + str(int(x['metadata']['pdfpagenum'])) for x in res['matches']]

#This prints two lists separated by '|': the contexts (a comma-delimited list) and the metadata (';'-delimited within each entry)
print(contexts)
print('|')
print(metadata)
#Take this and send it to ChatGPT 3.5 for prose (a rough sketch follows)
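
As a rough sketch of that last step, assuming the contexts, query, and limit variables above (the prompt wording and model name are my own placeholders, not part of the original code):

#Build a context-stuffed prompt; limit is used here as a rough character cap, not a token count
prompt = ("Answer the question using only the context below.\n\nContext:\n"
          + "\n---\n".join(contexts)[:limit]
          + "\n\nQuestion: " + query)
chat = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
)
print(chat["choices"][0]["message"]["content"])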

Not sure why you need to write custom code here, since you get the file name metadata by default, like this:

metadata={'source': 'C:\Dir\FileName.pdf'}
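
If that default 'source' field is all you need, one option (a minimal sketch, assuming the texts, embeddings, and indexname objects from the loader code above) is to index the split Documents directly, so each chunk's existing metadata is kept, instead of passing bare strings to from_texts:

#from_documents keeps each chunk's metadata, including the loader's 'source' path
docsearch = Pinecone.from_documents(texts, embeddings, index_name=indexname)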