Hi everyone, this is my first time using a vector database and I can't figure out what's going wrong with the API. Here's my code and the output.
import os
import requests
from bs4 import BeautifulSoup
import re
from fake_useragent import UserAgent
import openai
from pinecone import Pinecone, ServerlessSpec
# Initialize Pinecone
pc = Pinecone(api_key='API-KEY')
index_name = "web"

# Check if the index exists, otherwise create one
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must match the embedding dimension (1536 for text-embedding-ada-002)
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # adjust cloud/region to your project
    )

index = pc.Index(index_name, host='https://web-mcilcxv.svc.aped-4627-b74a.pinecone.io')

# Securely fetch your OpenAI API key from the environment
openai.api_key = os.environ["OPENAI_API_KEY"]
# List of website URLs
website_urls = [
    "https://www.accel.com", "https://www.a16z.com", "https://www.greylock.com",
    "https://www.benchmark.com", "https://www.sequoiacap.com", "https://www.indexventures.com",
    "https://www.kpcb.com", "https://www.lsvp.com", "https://www.matrixpartners.com",
    "https://www.500.co", "https://www.sparkcapital.com", "https://www.insightpartners.com"
]

def scrape_home_page_and_save_text(url):
    try:
        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text_content = re.sub(r'\s+', ' ', soup.get_text()).strip()
            print(f"Scraped {url} successfully.")
            return text_content
        else:
            print(f"Failed to scrape {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred while scraping {url}: {str(e)}")
        return None

def embed_text_content(text_content):
    try:
        response = openai.Embedding.create(input=text_content, model="text-embedding-ada-002")
        return response['data'][0]['embedding']  # embedding vector for the input text
    except openai.error.OpenAIError as e:  # catching OpenAI-specific errors (openai<1.0 client)
        print(f"An error occurred while generating embedding: {str(e)}")
        return None

# Scrape, embed, and store data
embeddings = []
ids = []
for url in website_urls:
    print(f"Scraping {url}...")
    text_content = scrape_home_page_and_save_text(url)
    if text_content:
        print(f"Embedding content from {url}...")
        embedding = embed_text_content(text_content)
        if embedding:
            embeddings.append(list(embedding))
            ids.append(url)

# Batch upsert to Pinecone as (id, vector) tuples
if embeddings and ids:
    index.upsert(vectors=list(zip(ids, embeddings)))
    print("Data for all sites uploaded to Pinecone.")

print("Scraping, embedding, and uploading finished.")

And this is the output I get:
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 01 May 2024 10:37:34 GMT', 'Content-Type': 'text/plain', 'Content-Length': '9', 'Connection': 'keep-alive', 'x-pinecone-auth-rejected-reason': 'Wrong API key', 'www-authenticate': 'Wrong API key', 'server': 'envoy'})
HTTP response body: Forbidden
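
Is something like the snippet below the right way to sanity-check the Pinecone key on its own before running the whole script? It's just a minimal sketch, and the PINECONE_API_KEY environment variable name is my own choice, not something from the script above:

import os
from pinecone import Pinecone

# Minimal connection check (sketch): assumes the key was exported first,
# e.g. export PINECONE_API_KEY="..."
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# A valid key should list the project's indexes; a wrong key should hit
# the same 403 Forbidden / "Wrong API key" response shown above.
print(pc.list_indexes().names())

If this also comes back Forbidden, does that mean the key itself is the problem (copied from the wrong project, stray whitespace, etc.) rather than anything in the scraping/embedding code?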