# Import necessary libraries
import streamlit as st
import os
from langchain.document_loaders import TextLoader, PyPDFLoader, JSONLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import LLMChain, RetrievalQA
from pprint import pprint
from langchain import PromptTemplate
import ijson
import itertools
import pinecone
import itertools
from sentence_transformers import SentenceTransformer
import uuid
import configparser
from io import BytesIO
import PyPDF2
import re
from transformers import AutoTokenizer
#For Observability, check out Arize AI
# Create a ConfigParser object and read the configurations.ini file
config = configparser.ConfigParser()
config.read('configurations.ini')
openai_api_key = config.get('OPENAI_Settings', 'OPENAI_API_KEY')
pinecone_env_key = config.get('Pinecone_Settings', 'PINECONE_ENVIRONMENT')
pinecone_api_key = config.get('Pinecone_Settings', 'PINECONE_API_KEY')

# Initialize the Pinecone client before any index operations below.
pinecone.init(api_key=pinecone_api_key, environment=pinecone_env_key)

# Prompt that turns a retrieved context into a patient-friendly explanation.
prompt_template = """
You are a doctor and have to give a patient friendly description for a surgery described in this query: {query}. Use the content below to make the response longer than 200 words but shorter than 500. Make sure to have the explanation be
easily understood by someone with very little medical knowledge.
Content: {context}
Description: """

# temperature=0 keeps the generated explanation deterministic.
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
PROMPT = PromptTemplate(template=prompt_template, input_variables=["query", "context"])
chain = LLMChain(llm=llm, prompt=PROMPT)
def doc_embeddings(file):
    """Return the sentence-transformer embedding for the given text.

    file: the text to embed (a chunk's page_content in this pipeline).

    The MiniLM model is loaded lazily on first call and cached on the
    function object — the original reloaded it on every call, which is
    very expensive when embedding one chunk at a time.
    """
    model = getattr(doc_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer('flax-sentence-embeddings/all_datasets_v4_MiniLM-L6')
        doc_embeddings._model = model
    return model.encode(file)
def textProcessing(files):
    """Load a plain-text file with LangChain's TextLoader and hand the
    loaded documents off to split() for chunking and Pinecone upsert."""
    documents = TextLoader(files).load()
    split(documents, 1000)
def pdfProcessing(files):
    """Extract and normalize the text of an uploaded PDF, then chunk and
    upsert it via split().

    files: a file-like upload (e.g. a Streamlit UploadedFile) whose raw
    bytes are a PDF document.
    """
    with BytesIO(files.read()) as file_obj:
        # Create a PDF reader object
        pdf_reader = PyPDF2.PdfReader(file_obj)
        # Read the text content of every page
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
    # Collapse all whitespace runs (spaces AND line breaks) into single
    # spaces in one pass — the original's separate `\n+` substitution was
    # redundant because `\s+` already matches newlines.
    cleaned_text = re.sub(r'\s+', ' ', text)
    split(cleaned_text, 1000)
def split(data, chunk_size):
    """Split text into chunks of at most chunk_size characters and upsert
    them into Pinecone.

    data: the text to split (wrapped in a list for create_documents).
    chunk_size: maximum characters per chunk.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        # Bug fix: honor the chunk_size parameter — the original
        # hard-coded 1000 here, silently ignoring the argument.
        chunk_size=chunk_size, chunk_overlap=0, separators=[" ", ",", "\n"]
    )
    chunks = text_splitter.create_documents([data])
    pinecone_upsert(chunks)
def pinecone_upsert(chunks, batch_size=100):
    """Embed each chunk and upsert the vectors into the 'totalcare'
    Pinecone index in batches.

    chunks: sequence of documents exposing a .page_content attribute.
    batch_size: number of vectors per upsert request (default 100, the
        original's hard-coded batch size — kept backward-compatible).
    """
    index = pinecone.Index('totalcare')
    # Bug fix: the original line read `pinecone_vectors =` with no value,
    # which is a syntax error; it must start as an empty list.
    pinecone_vectors = []
    total = len(chunks)
    for j, chunk in enumerate(chunks):
        print("\tCreating embedding for chunk ", j, "of ", total)
        vector = doc_embeddings(chunk.page_content)
        # A fresh random UUID gives every vector a unique id.
        myuuid = uuid.uuid4()
        # add vector to pinecone_vectors list
        print("\tAdding vector to pinecone_vectors list for chunk ", j, " of ", total)
        pinecone_vectors.append((str(myuuid), vector))
        if len(pinecone_vectors) == batch_size:
            print("Upserting batch of 100 vectors...")
            upsert_response = index.upsert(vectors=pinecone_vectors)
            pinecone_vectors = []
    # if there are any vectors left, upsert them
    if pinecone_vectors:
        print("Upserting remaining vectors...")
        upsert_response = index.upsert(vectors=pinecone_vectors)
        pinecone_vectors = []
    print("Vector upload complete.")