Hi, I am sorry, I have a follow-on query. I can now detect if the document exists, so I can avoid adding the documents to the index again. However, I am struggling to then use the returned index object to query.
Here is the code that returns the index object if a document is already found. Seems to return a sensible response and an index object.
# Checks whether the document is already in the PINECONE index; returns a
# queryable object if it is, otherwise False.
#@st.cache(suppress_st_warning=True)
def pinecone_document_exists(guid, index_name, namespace, embeddings=None):
    """Look up a document by GUID in a Pinecone namespace.

    Parameters
    ----------
    guid : str
        Unique identifier stored in each chunk's metadata at upsert time.
    index_name : str
        Name of the Pinecone index.
    namespace : str
        Pinecone namespace to probe.
    embeddings : optional
        LangChain embeddings object. When supplied and the document is found,
        the return value is a LangChain `Pinecone` vectorstore that supports
        `.similarity_search()` — the same kind of object `Pinecone.from_texts`
        returns. When omitted, the raw `pinecone.Index` is returned for
        backward compatibility (note: that object has NO `similarity_search`).

    Returns
    -------
    A vectorstore (or raw index) if a matching chunk exists, else False.
    """
    index = pinecone.Index(index_name)
    # Metadata filter: match only chunks uploaded for this document.
    metadata_filter = {"guid": guid}
    try:
        # A query must include a vector; a zero vector of the embedding
        # dimension (1536 for text-embedding-ada-002) suffices for a pure
        # metadata-existence probe.
        query_response = index.query(
            vector=[0] * 1536,
            filter=metadata_filter,
            top_k=1,
            include_metadata=True,
            namespace=namespace,
        )
    except Exception:
        # A brand-new index has no namespaces yet, so the very first query
        # fails — treat that as "document not present".
        return False
    st.info(query_response)
    matches_list = query_response['matches']  # Extracting the "matches" list
    if not matches_list:
        st.info("The document was not found in PINECONE")
        return False
    st.info(f"The document was found in PINECONE {matches_list}")
    # BUG FIX: callers do docsearch.similarity_search(...), which a raw
    # pinecone.Index does not provide. Wrap the index in the LangChain
    # Pinecone vectorstore ("text" is the default metadata key LangChain
    # stores page content under) so the return value behaves like the one
    # produced by Pinecone.from_texts().
    if embeddings is not None:
        return Pinecone(index, embeddings.embed_query, "text", namespace=namespace)
    return index
However, when I try to use this index to do a similarity search in LangChain, I get the following error. The query code runs fine in the cases where I add the documents to PINECONE. This is the error I get.
AttributeError: 'Index' object has no attribute 'similarity_search'
Traceback:
File "C:\Users\jonathan.sutcliffe\Anaconda3\lib\site-packages\streamlit\runtime\scriptrunner\script_runner.py", line 556, in _run_script
exec(code, module.__dict__)
File "C:\Users\jonathan.sutcliffe\app.py", line 595, in <module>
start_chat(docsearch,namespace,guid)
File "C:\Users\jonathan.sutcliffe\app.py", line 424, in start_chat
docs = docsearch.similarity_search(query,include_metadata=True, namespace=namespace,k=4, filter=metadata)
This is the search code that executes. As I say it works if I upsert, but not when I try to reuse the index.
# Retrieve the top-4 chunks relevant to `query` (restricted to this
# document via the metadata filter) once, up front.
# NOTE(review): `docs` is built from `query`, but the loop below runs each
# `qry` from query_list against this SAME fixed set of documents — confirm
# that is intended; otherwise the similarity_search belongs inside the loop,
# keyed on `qry`.
docs = docsearch.similarity_search(query,include_metadata=True, namespace=namespace,k=4, filter=metadata)
for qry in query_list:
    st.warning(f"Question: {qry}")
    # Find the matching documents for the qry
    message_response = chain.run(input_documents=docs, question=qry,verbose=True)
    st.success(message_response)
st.info("Got to end of function - chat")
where docsearch is the index object returned from the 1st function.
Below is the actual code that does an upsert; after this runs, I can successfully query.
# If the document is not in Pinecone, chunk it and then upsert the chunks
# into the Pinecone index.
if docsearch is False:  # identity check — `docsearch` is either False or an index object
    # The document was not found, so create chunks and embeddings (vectors)
    # and store them in PINECONE.
    st.warning("Didn't find the document so processing and adding to DB")
    with st.spinner("Chunking, Creating embeddings and storing in Pinecone - will take a few minutes, normally done already in the data injestion stage"):
        # LOAD THE TEMPORARY FILE
        # The files are in the temporary location, e.g. %TEMP%\*.pdf
        loader = UnstructuredPDFLoader(str(tfp))
        data = loader.load()
        character_count = str(len(data[0].page_content))
        print(character_count)
        if data:
            st.info(f"There are {character_count} total characters in your document")
        else:
            st.error("Didn't find any content in the PDf document")
        # CHUNK THE BIG FILE INTO SMALLER UNITS
        # Chunking parameters: ~1000 tokens seems to be the maximum with this
        # model, so keep chunks bounded with a small sliding-window overlap.
        chsize = 2000  # In tokens - can't be too big
        chover = 30    # sliding window
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chsize, chunk_overlap=chover)
        # Now we have the individual texts
        texts = text_splitter.split_documents(data)
        st.info(f'After chunking we now have {len(texts)} documents')
        # UPLOAD ALL THE CHUNKS INTO THE VECTOR DATABASE
        # Metadata lets us later find this document (guid) and each chunk's
        # position (chunk_number) in the shared index/namespace.
        metadatas = [
            {"guid": guid, "chunk_number": i + 1, "filename": uploaded_file.name}
            for i, text in enumerate(texts)
        ]
        # PERF/FIX: upsert every chunk in ONE from_texts call instead of one
        # call per chunk — the original looped, re-creating the vectorstore
        # and overwriting `docsearch` on every iteration.
        docsearch = Pinecone.from_texts(
            [text.page_content for text in texts], embeddings,
            index_name=index_name, namespace=namespace,
            metadatas=metadatas,
        )
        st.success(f'Uploaded {len(texts)} sub-document chunks of vectors to PINECONE with Metadata')
Sorry for the very long post - but I have been trying to fix this for a few hours now and I am going round in circles. Hopefully, it's an easy spot for a better programmer than me.
Thanks in advance !
Jono