Store embeddings of the duas' tags in the pgvector database named vector_db

This commit is contained in:
hasnain 2025-11-02 18:14:56 +05:00
parent 7c66eaa059
commit 0a6e41d046

View File

@ -0,0 +1,69 @@
import json
from langchain_openai import OpenAIEmbeddings
# from langchain.vectorstores.pgvector import PGVector
from langchain_postgres import PGVector
# from langchain.schema import Document
from langchain_core.documents import Document
from dotenv import load_dotenv
load_dotenv()
# Database configuration
# NOTE(review): the credentials are hard-coded in this URL; consider moving
# them to the environment before this is used outside local development.
# NOTE(review): langchain_postgres documents the psycopg (v3) URL form
# `postgresql+psycopg://`; confirm that the psycopg2 driver is intended here.
CONNECTION_STRING = 'postgresql+psycopg2://postgres:test@localhost:5433/vector_db'
COLLECTION_NAME = 'duas_tags_vectors'
SOURCE_JSON_PATH = 'duas_directus_published.json'
EMBEDDING_MODEL = "text-embedding-3-small"


def tags_to_text(tags: object) -> str:
    """Return the text to embed for one record's ``tags`` value.

    Args:
        tags: The raw ``tags`` value from a record. A list is joined with
            ``', '``; any other non-null value is converted with ``str``.

    Returns:
        The tag text with surrounding whitespace removed, or ``''`` when
        there is nothing to embed (null, empty list, blank string).
    """
    # A JSON null arrives as None; str(None) would embed the literal "None".
    if tags is None:
        return ''
    if isinstance(tags, (list, tuple)):
        # Coerce each entry to str so numeric tags do not break the join,
        # and drop null/blank entries so no dangling separators are embedded.
        cleaned = (str(tag).strip() for tag in tags if tag is not None)
        return ', '.join(part for part in cleaned if part)
    return str(tags).strip()


def build_documents(duas_data: list) -> list:
    """Build one ``Document`` per record that has usable tags.

    Only the tag text becomes ``page_content`` (and is therefore what gets
    embedded); the remaining record fields are copied into ``metadata`` so
    they come back with search results.

    Args:
        duas_data: Iterable of record mappings as loaded from the JSON file.

    Returns:
        A list of ``Document`` objects; records without tags are skipped.
    """
    documents = []
    for dua in duas_data:
        tags = dua.get('tags', [])
        tags_text = tags_to_text(tags)
        # Skip if no tags
        if not tags_text:
            continue
        documents.append(
            Document(
                page_content=tags_text,  # Only tags will be embedded
                metadata={
                    'id': dua.get('id'),
                    'arabic': dua.get('arabic', ''),
                    'transliteration': dua.get('transliteration', ''),
                    'translation': dua.get('translation', ''),
                    'urdu': dua.get('urdu', ''),
                    'romanUrdu': dua.get('romanUrdu', ''),
                    'category': dua.get('category', ''),
                    'occasion': dua.get('occasion', ''),
                    'source': dua.get('source', ''),
                    'tags': tags,  # Keep original tags for reference
                },
            )
        )
    return documents


def main() -> None:
    """Load the duas JSON file, embed each record's tags, and store them."""
    # Load JSON data
    with open(SOURCE_JSON_PATH, 'r', encoding='utf-8') as f:
        duas_data = json.load(f)
    print(f"Loaded {len(duas_data)} duas records")

    # Create documents with ONLY tags for embedding
    documents = build_documents(duas_data)
    print(f"Created {len(documents)} documents from tags")
    if not documents:
        # Nothing to embed; skip the embedding and database round trip.
        print("No documents with tags found; nothing to store.")
        return

    # Initialize embeddings
    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

    # Create vector store - embeddings are created only from page_content (tags)
    print("Creating embeddings and storing in pgvector...")
    PGVector.from_documents(
        embedding=embeddings,
        documents=documents,
        collection_name=COLLECTION_NAME,
        connection=CONNECTION_STRING,
    )
    print("✓ Vector store created successfully!")


if __name__ == "__main__":
    main()