You need a dictionary for each chunk, appended to a list. Here is my working pdfLoader.py code, followed by the getData.py retrieval code.
pip install pinecone-client
pip install openai
pip install langchain
pip install unstructured
pip install pytesseract
pip install tiktoken
pip install pikepdf
# pdfLoader.py — split a PDF into single pages with pikepdf, chunk each page
# with LangChain, and upsert the chunks into a Pinecone index.  Each page's
# filename is passed in the metadata so it can be displayed as a link in the
# Answer output.
# NOTE: unstructured may throw a detectron2 warning on load; it can be ignored
# and does not affect anything.
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone, sys, os
from pikepdf import Pdf

OPENAI_API_KEY = 'xxxxxxxxxx'
PINECONE_API_KEY = 'xxxxxxxxx'
PINECONE_API_ENV = 'us-east-1-aws'

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,      # find at app.pinecone.io
    environment=PINECONE_API_ENV,  # next to api key in console
)

indexname = "Your PineCone Index Name"

# Filename must be a full path, not relative
filename = "C:/FileUploads/MyReport.pdf"
# filename = sys.argv[1]  # if you want to call the py file and pass a parameter

pdf = Pdf.open(filename)
# Split off the base filename without the .pdf so "001.pdf" etc. can be
# appended for each page file
name, ext = os.path.splitext(filename)
# Master filename for the link in the Answer, passed in the metadatas
# (os.path.basename already strips any directory components)
masterfilename = os.path.basename(filename)
# print(masterfilename)

# Loop through each page
for pnum, page in enumerate(pdf.pages):
    # Write this page out as its own single-page PDF
    dst = Pdf.new()
    dst.pages.append(page)
    pagenum = pnum + 1
    # Zero-padded 3-digit page number appended to the name, starting with 001
    pagefilenamepath = name + f'{pagenum:03d}.pdf'
    dst.save(pagefilenamepath)
    dst.close()  # release the per-page file handle
    # Page filename to pass in the metadata
    pagefilename = os.path.basename(pagefilenamepath)
    # Read this page
    loader = UnstructuredFileLoader(pagefilenamepath)
    data = loader.load()
    # Chunk size can be 500 to 5000+
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=0)
    texts = text_splitter.split_documents(data)
    # metadatas= expects one dict PER chunk.  Build a NEW dict for every chunk:
    # appending one shared dict repeatedly makes every list entry alias the
    # same object, which breaks as soon as any entry is mutated independently.
    metalist = [
        {
            "masterfilename": masterfilename,
            "pagefilename": pagefilename,
            "pdfpagenum": pagenum,
        }
        for _ in texts
    ]
    # print(metalist)
    docsearch = Pinecone.from_texts(
        [t.page_content for t in texts],
        embeddings,
        metadatas=metalist,
        index_name=indexname,
    )
    print(pagefilename + " Processed")
    # Still in the page loop

pdf.close()
# End of this routine
This is the retrieval code (getData.py):
# getData.py — embed a question with OpenAI, query the Pinecone index for the
# most relevant chunks, and print the contexts plus their source-file metadata.
import pinecone
import openai
import sys
from urllib.parse import unquote

# The question/prompt is passed as a URL-encoded command-line argument
ARG = sys.argv[1]
RET = unquote(ARG)
# RET = "Ask a test question here?"

openai.api_key = 'xxxxxxxxxx'
PINECONE_API_KEY = 'xxxxxxxxxx'
PINECONE_API_ENV = 'us-east-1-aws'

# Initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,      # find at app.pinecone.io
    environment=PINECONE_API_ENV,  # next to api key in console
)

# Connect to the index
index = pinecone.Index('csr')
# View index stats
print(index.describe_index_stats())

embed_model = "text-embedding-ada-002"
query = RET
res = openai.Embedding.create(
    input=[query],
    model=embed_model,
)

# Limit the size of the context to add to the prompt (used when building the
# final ChatGPT prompt downstream)
limit = 3750

# Retrieve the query embedding
xq = res['data'][0]['embedding']
# Get relevant contexts from Pinecone
res = index.query(xq, top_k=2, include_metadata=True)
# print(res)
contexts = [x['metadata']['text'] for x in res['matches']]
metadata = [
    x['metadata']['masterfilename'] + ';'
    + x['metadata']['pagefilename'] + ';'
    + str(int(x['metadata']['pdfpagenum']))
    for x in res['matches']
]
# This prints multiple lists: comma-delimited contexts, then ";"-delimited metadata
print(contexts)
print('|')
print(metadata)
# Take this output and send it to ChatGPT 3.5 for prose