from langchain.llms import OpenAI
from sentence_transformers import SentenceTransformer
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
import os
import glob
import pdfplumber
import pinecone
import numpy as np
load_dotenv()
# Initialize the language model
llm = OpenAI(temperature=0)
model = SentenceTransformer('all-MiniLM-L6-v2')
# Initialize embeddings model
embeddings_model = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
# Initialize Pinecone
pinecone.init(
api_key=os.getenv("PINECONE_API_KEY"),
environment='my_environment**`strong text`**'
)
index = pinecone.Index('my_index')
# Directory to retrieve your files from
directory_path = 'docs'
# Get list of PDF files in the directory
pdf_files = glob.glob(f"{directory_path}/*.pdf")
def split_into_chunks(text, chunk_size=1000, overlap_size=200):
"""
Splits the text into overlapping chunks of a given size.
"""
chunks = []
start = 0
while start < len(text):
end = start + chunk_size
chunks.append(text[start:end])
start += chunk_size - overlap_size
return chunks
for pdf_file in pdf_files:
# Load the PDF
with pdfplumber.open(pdf_file) as pdf:
raw_text = '\n'.join(page.extract_text() for page in pdf.pages)
# Split the text into chunks
chunks = split_into_chunks(raw_text)
# Create embeddings for each chunk and store them in Pinecone
for i, chunk in enumerate(chunks):
embedding = model.encode([chunk])[0]
id = f"{os.path.basename(pdf_file)}_chunk_{i}"
index.upsert(ids=[id], vectors=[embedding.tolist()]) # Convert the numpy array to a list
print("PDFs processed and stored in Pinecone.")
I am a complete newbie, trying to train a chatbot using Pinecone, to be honest I do not know what I am doing, I am taking help of chatgpt4, but I am at a loop at the moment.
This is the error i am getting " raise ValueError(f"Invalid vector value passed: cannot interpret type {type(item)}β)
ValueError: Invalid vector value passed: cannot interpret type <class βlistβ>β