# --- Part 1: embedding / indexing script ---
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from pinecone_text.sparse import SpladeEncoder
import json
import torch
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # fixed: was `name`, must be the module dunder

# Initialize API clients.
# NOTE(review): keys are blank here — load them from the environment rather
# than hard-coding credentials in source.
client = OpenAI(api_key="")
pc = Pinecone(api_key="")

# Constants
INDEX_NAME = "news"
DENSE_DIMENSION = 3072  # OpenAI text-embedding-3-large dimension
def create_pinecone_index():
    """Create the Pinecone serverless index if it doesn't exist.

    Returns:
        A Pinecone Index handle for INDEX_NAME.
    """
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=DENSE_DIMENSION,
            # dotproduct metric is required to query with sparse vectors
            metric="dotproduct",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1",
            ),
        )
    return pc.Index(INDEX_NAME)
def get_dense_embedding(text):
    """Return the dense embedding of `text` from OpenAI's text-embedding-3-large.

    Args:
        text: The string to embed.

    Returns:
        A list of floats (3072-dimensional for this model).
    """
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-large",
    )
    return response.data[0].embedding
def initialize_splade_encoder(device):
    """Return a SpladeEncoder running on `device` ('cuda' or 'cpu')."""
    return SpladeEncoder(device=device)
def get_sparse_embedding(splade_encoder, text):
    """Encode `text` with SPLADE and return a Pinecone-style sparse vector.

    Args:
        splade_encoder: Object exposing `encode_documents(text)` returning a
            dict with 'indices' and 'values'.
        text: The string to encode.

    Returns:
        Dict with integer 'indices' and float 'values', as Pinecone expects.
    """
    sparse_vector = splade_encoder.encode_documents(text)
    return {
        # Pinecone requires plain ints; the encoder may yield numpy scalars
        "indices": [int(i) for i in sparse_vector["indices"]],
        "values": sparse_vector["values"],
    }
def upsert_to_pinecone(index, splade_encoder, text):
    """Embed one record (dense + sparse) and upsert it into `index`.

    Args:
        index: Pinecone Index handle.
        splade_encoder: SPLADE encoder for the sparse vector.
        text: Record dict expected to carry 'id', 'eng_transcript' and
            'sentiment' keys.

    Errors are logged rather than raised so one bad record does not abort
    a batch run.
    """
    # Resolve the id up front: the original read text['id'] inside the
    # except handler, which itself KeyErrors when 'id' is missing.
    record_id = text.get("id", "<unknown>")
    try:
        tweet_text = text["eng_transcript"]
        dense_vector = get_dense_embedding(tweet_text)
        sparse_vector = get_sparse_embedding(splade_encoder, tweet_text)
        index.upsert(
            vectors=[
                {
                    "id": str(text["id"]),  # Use 'id' instead of 'sentiment'
                    "values": dense_vector,
                    "sparse_values": sparse_vector,
                    "metadata": {"text": tweet_text, "sentiment": text["sentiment"]},
                }
            ]
        )
        logger.info(f"Successfully upserted record with ID: {record_id}")
    except Exception as e:
        logger.error(f"Error upserting record with ID {record_id}: {str(e)}")
def main():
    """Load records from fixed.json and upsert each into the Pinecone index."""
    # Create or get existing Pinecone index
    index = create_pinecone_index()

    # Load tweet data
    with open("fixed.json", "r", encoding="utf-8") as file:
        data = json.load(file)

    # Run SPLADE on GPU when available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    splade_encoder = initialize_splade_encoder(device)

    # Upsert tweets to Pinecone
    for text in data["entries"]:
        upsert_to_pinecone(index, splade_encoder, text)

    print("Embedding and upserting process completed.")


if __name__ == "__main__":  # fixed: was `if name == "main"`
    main()
# --- Part 2: hybrid query script ---
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from pinecone_text.sparse import SpladeEncoder
import json
import torch
import logging
# Initialize API clients.
# NOTE(review): keys are blank here — load them from the environment rather
# than hard-coding credentials in source.
client = OpenAI(api_key="")
pc = Pinecone(api_key="")

# Constants
INDEX_NAME = "news"
def get_dense_embedding(text):
    """Return the dense embedding of `text` from OpenAI's text-embedding-3-large.

    Args:
        text: The string to embed.

    Returns:
        A list of floats (3072-dimensional for this model).
    """
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-large",
    )
    return response.data[0].embedding
def get_sparse_embedding(splade_encoder, text):
    """Encode `text` with SPLADE and return a Pinecone-style sparse vector.

    Args:
        splade_encoder: Object exposing `encode_documents(text)` returning a
            dict with 'indices' and 'values'.
        text: The string to encode.

    Returns:
        Dict with integer 'indices' and float 'values', as Pinecone expects.
    """
    sparse_vector = splade_encoder.encode_documents(text)
    return {
        # Pinecone requires plain ints; the encoder may yield numpy scalars
        "indices": [int(i) for i in sparse_vector["indices"]],
        "values": sparse_vector["values"],
    }
def hybrid_score_norm(dense, sparse, alpha: float):
    """Weight dense and sparse vectors with a convex combination.

    Args:
        dense: Dense embedding (list of floats).
        sparse: Sparse vector dict with 'indices' and 'values'.
        alpha: Weight in [0, 1]; 1 = pure dense, 0 = pure sparse.

    Returns:
        Tuple of (scaled dense list, scaled sparse dict).

    Raises:
        ValueError: If alpha is outside [0, 1].
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    scaled_sparse = {
        "indices": sparse["indices"],
        "values": [v * (1 - alpha) for v in sparse["values"]],
    }
    return [v * alpha for v in dense], scaled_sparse
def hybrid_search(query_text, top_k=10, alpha=0.01):
    """Run a hybrid (dense + sparse) query against the Pinecone index.

    Args:
        query_text: Free-text query to embed and search with.
        top_k: Maximum number of matches to return.
        alpha: Dense/sparse weight in [0, 1] (see hybrid_score_norm).

    Returns:
        The Pinecone query response (matches with metadata included).
    """
    # Initialize SpladeEncoder on GPU when available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    splade_encoder = SpladeEncoder(device=device)

    # Get dense and sparse embeddings
    dense_vector = get_dense_embedding(query_text)
    sparse_vector = get_sparse_embedding(splade_encoder, query_text)

    # Bug fix: forward the caller's alpha; the original hard-coded alpha=0.01
    # here, silently ignoring the function parameter.
    hdense, hsparse = hybrid_score_norm(dense_vector, sparse_vector, alpha=alpha)

    # Perform hybrid search
    index = pc.Index(INDEX_NAME)
    query_response = index.query(
        vector=hdense,
        sparse_vector=hsparse,
        top_k=top_k,
        include_metadata=True,
    )
    return query_response
# Example usage
query_text = "flying objects"
results = hybrid_search(query_text)

# Process and print results
for match in results["matches"]:
    print(f"ID: {match['id']}")
    print(f"Score: {match['score']}")
    print(f"Text: {match['metadata']['text']}")
    print(f"Sentiment: {match['metadata']['sentiment']}")
    print("—")