Dua tag embeddings are stored in the pgvector database named vector_db
This commit is contained in:
parent
7c66eaa059
commit
0a6e41d046
69
generate_dua_tags_embedding.py
Normal file
69
generate_dua_tags_embedding.py
Normal file
@ -0,0 +1,69 @@
|
||||
"""Embed the tags of each dua record and store the vectors in pgvector.

Reads ``duas_directus_published.json``, builds one ``Document`` per record
whose ``page_content`` is the record's tags (the only text that is embedded),
keeps the remaining record fields as metadata, and writes everything to the
``duas_tags_vectors`` collection through ``PGVector.from_documents``.
"""
import json
import os

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
# NOTE: langchain_postgres / langchain_core replace the legacy
# langchain.vectorstores.pgvector and langchain.schema import paths.
from langchain_postgres import PGVector

load_dotenv()

# Database configuration.
# The connection string can be overridden from the environment (or .env, which
# load_dotenv() has just read) so credentials need not live in source control;
# the default is the original local development value.
CONNECTION_STRING = os.getenv(
    'PGVECTOR_CONNECTION_STRING',
    'postgresql+psycopg2://postgres:test@localhost:5433/vector_db',
)
COLLECTION_NAME = 'duas_tags_vectors'

# Record fields copied verbatim into metadata, defaulting to '' when absent.
METADATA_TEXT_FIELDS = (
    'arabic',
    'transliteration',
    'translation',
    'urdu',
    'romanUrdu',
    'category',
    'occasion',
    'source',
)


def _tags_to_text(tags):
    """Return the text to embed for *tags*, or '' when there is nothing to embed.

    A list is joined with ', ' after dropping None/blank entries and coercing
    the rest to str (so a null or numeric entry cannot break ``str.join``).
    None yields '' rather than the literal string 'None'. Any other value is
    converted with ``str``.
    """
    if tags is None:
        return ''
    if isinstance(tags, list):
        return ', '.join(
            str(tag) for tag in tags
            if tag is not None and str(tag).strip()
        )
    return str(tags)


# Load JSON data
with open('duas_directus_published.json', 'r', encoding='utf-8') as f:
    duas_data = json.load(f)

print(f"Loaded {len(duas_data)} duas records")

# Create documents with ONLY tags for embedding
documents = []
for dua in duas_data:
    # 'tags' may be missing, null, a list, or a plain string.
    tags = dua.get('tags')
    tags_text = _tags_to_text(tags)

    # Skip records that have nothing to embed.
    if not tags_text.strip():
        continue

    # All original data goes into metadata so it can be returned on retrieval.
    metadata = {'id': dua.get('id')}
    metadata.update(
        (field, dua.get(field, '')) for field in METADATA_TEXT_FIELDS
    )
    metadata['tags'] = tags  # Keep original tags for reference

    documents.append(
        Document(
            page_content=tags_text,  # Only tags will be embedded
            metadata=metadata,
        )
    )

print(f"Created {len(documents)} documents from tags")

# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Create vector store - embeddings will be created only from tags (page_content)
# NOTE(review): no explicit ids are passed, so re-running this script may add
# duplicate rows to the collection - confirm against the PGVector version in use.
print("Creating embeddings and storing in pgvector...")
db = PGVector.from_documents(
    embedding=embeddings,
    documents=documents,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
)

print("✓ Vector store created successfully!")
|
||||
Loading…
x
Reference in New Issue
Block a user