Urgent help needed for Pinecone metadata filter issue in chatbot using OpenAI Langchain Streamlit;

Hello Pinecone Community,
I hope this message finds you well. I am reaching out for your help in resolving a issue that I have been struggling with for days. I have a chatbot that’s currently functioning and I have created Ui on Streamlit to load the vectors and metadata, but I want to filter the results by a metadata field ‘pdf_id’ that I added. I am using the

  • OpenAI
  • Langchain
  • Streamlit solution for this project.

The issue I’m encountering is that I can’t get the metadata filter to work. My chatbot script calls functions from a utils script.
When I run this script in Visual Studio Code, I keep getting an error:
“Import ‘pinecone’ could not be resolved PylancereportMissingImports.”
However, this issue doesn’t prevent the script from running when I use the functions in the 1) Functions work but do not filter by metadata section below. Now, I want to replace these functions so I can filter metadata using the following code in my chatbot script:

        filter = {"pdf_id": {"$eq": "fd31d0e8"}}
        response = get_answer(query, filter=filter) # Replace

I have tried to use metadata filtering documents on pinecone, and countless other solutions, but nothing seems to work. You can see one of the many attempts in the 2) Functions do not work with metadata filters section below.
I would really appreciate your assistance. I am at my wit’s end with this problem and would be grateful for any advice or help you can provide.
Thank you in advance for your support!
Here is the utlis script:

import os
from dotenv import load_dotenv
load_dotenv("API.env")

import langchain
import openai
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI as ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

# Define Pinecone API key and environment
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV")

# Initialize Pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_API_ENV
)

index_name = "langchain2"

# Embeddings
embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)

# Pinecone index
docs=[]
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)

# Load the LLM and QA Chain
llm = ChatOpenAI(temperature=0)
chain = load_qa_chain(llm, chain_type="stuff")

def get_initial_message():
    messages = [
        {"role": "system", "content": "You are a helpful assistant that supports users who are international development practitioners. If you do not know the answer to any question, you will seek further clarification from the user"},
        {"role": "user", "content": "I need you to help me research some documents in your knowledgebase"},
        {"role": "assistant", "content": "Thats awesome, what do you want to know?"}
    ]
    return messages
################################################################
# **1) Functions work but do not filter by metadata** 

def get_similiar_docs(query, k=2, score=False):
    if score:
        similar_docs = index.similarity_search_with_score(query, k=k)
    else:
        similar_docs = index.similarity_search(query, k=k)
    return similar_docs

def get_answer(prompt):
    similar_docs = get_similiar_docs(prompt)
    answer = chain.run(input_documents=similar_docs, question=prompt)
    return answer

# **2) Functions do not work with metadata filters** 
# **AttributeError:** 'Pinecone' object has no attribute 'query'

def get_similar_docs(query_vector, index, k=5, filter=None):
    similar_docs = index.query(queries=[query_vector], top_k=k, filter=filter)
    return similar_docs

def get_answer(prompt, filter=None):
    similar_docs = get_similar_docs(prompt, index, k=5, filter=filter)
    answer = chain.run(input_documents=similar_docs, question=prompt)
    return answer
#####################################################################


def get_chatgpt_response(messages, model="gpt-3.5-turbo"):
    print("model: ", model)
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages
    )
    return response['choices'][0]['message']['content']

def update_chat(messages, role, content):
    messages.append({"role": role, "content": content})
    return messages

Here is the chatbot script

# Streamlit Chatbot UI script

import pinecone
import streamlit as st
from streamlit_chat import message
from chat_utils1 import get_initial_message, update_chat, get_answer  # Import the get_answer function
import os
from dotenv import load_dotenv

load_dotenv('C:/Users/david/My Drive/Colab Notebooks/Building a GPT-4 Chatbot using ChatGPT API and Streamlit Chat/API.env')

# Pinecone initialization
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV")

index_name = "langchain2"
index = pinecone.Index(index_name=index_name)  # Create the Pinecone instance

st.title("Chatbot : ChatGPT for Evaluators")
st.subheader("AI Evaluation Companion:")

model = st.selectbox(
    "Select a model",
    ("gpt-3.5-turbo", "gpt-4")
)
#Initialize the session states to store the generated messages, past queries, and the initial set of messages.

if 'generated' not in st.session_state:
    st.session_state['generated'] = []
if 'past' not in st.session_state:
    st.session_state['past'] = []

query = st.text_input("Query: ", key="input")

if 'messages' not in st.session_state:
    st.session_state['messages'] = get_initial_message()
#Process the user's query and generate the AI response.

if query:
    with st.spinner("generating..."):
        messages = st.session_state['messages']
        messages = update_chat(messages, "user", query)
        filter = {"pdf_id": {"$eq": "fd31d0e8"}}
        response = get_answer(query, filter=filter) # Replace get_chatgpt_response with get_answer
        messages = update_chat(messages, "assistant", response)
        st.session_state.past.append(query)
        st.session_state.generated.append(response)

#Display the chat messages and an expander to show the full message history.

if st.session_state['generated']:

    for i in range(len(st.session_state['generated'])-1, -1, -1):
        message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
        message(st.session_state["generated"][i], key=str(i))

    with st.expander("Show Messages"):
        st.write(messages)

Did you ever figure out what the problem/solution was?

1 Like