"""Embed dua tags into a pgvector collection (generate_dua_tags_embedding.py).

Reads dua records from ``duas_directus_published.json``, builds one LangChain
``Document`` per record whose ``page_content`` is the record's tags joined into
a single string, and stores those documents in a PGVector collection using
OpenAI embeddings. The other record fields are carried along as metadata so
they can be returned with search hits.
"""

import json
import os

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector

# Load .env before reading configuration so the os.getenv() lookups below can
# see values defined there.
load_dotenv()

# Database configuration. The defaults are the original hard-coded values;
# set PGVECTOR_CONNECTION_STRING / PGVECTOR_COLLECTION_NAME to override them
# without editing the source (and to keep real credentials out of the repo).
# NOTE(review): langchain-postgres documents the psycopg (v3) driver URL form
# ``postgresql+psycopg://``; confirm that psycopg2 is really intended here.
CONNECTION_STRING = os.getenv(
    'PGVECTOR_CONNECTION_STRING',
    'postgresql+psycopg2://postgres:test@localhost:5433/vector_db',
)
COLLECTION_NAME = os.getenv('PGVECTOR_COLLECTION_NAME', 'duas_tags_vectors')

# Record fields copied verbatim into each document's metadata; a missing
# field is stored as ''.
_TEXT_FIELDS = (
    'arabic',
    'transliteration',
    'translation',
    'urdu',
    'romanUrdu',
    'category',
    'occasion',
    'source',
)


def _tags_to_text(tags: object) -> str:
    """Return the text to embed for *tags*, or ``''`` if nothing is usable.

    Args:
        tags: The raw ``tags`` value of a record. May be a list of tags, a
            single string, or ``None`` (JSON ``null``).

    Returns:
        The tags joined with ``', '`` for a list, or the stripped string form
        for any other value. ``None`` values and blank entries are dropped so
        they are never embedded as the literal text ``"None"`` or as empty
        tags.
    """
    if tags is None:
        return ''
    if isinstance(tags, list):
        # str() each element: ', '.join() raises TypeError on non-strings.
        cleaned = (str(tag).strip() for tag in tags if tag is not None)
        return ', '.join(tag for tag in cleaned if tag)
    return str(tags).strip()


def _build_documents(duas: list[dict]) -> list[Document]:
    """Build one ``Document`` per record in *duas* that has usable tags.

    Args:
        duas: Records loaded from the JSON file; each is read with ``.get``.

    Returns:
        Documents whose ``page_content`` is the tag text (the only part that
        gets embedded) and whose metadata holds the record's ``id``, the
        fields listed in ``_TEXT_FIELDS`` and the original ``tags`` value.
        Records without usable tags are skipped.
    """
    docs = []
    for dua in duas:
        raw_tags = dua.get('tags', [])
        tags_text = _tags_to_text(raw_tags)

        # Skip if no tags
        if not tags_text:
            continue

        metadata = {'id': dua.get('id')}
        metadata.update({field: dua.get(field, '') for field in _TEXT_FIELDS})
        metadata['tags'] = raw_tags  # Keep original tags for reference

        docs.append(Document(page_content=tags_text, metadata=metadata))
    return docs


# Load JSON data
with open('duas_directus_published.json', 'r', encoding='utf-8') as f:
    duas_data = json.load(f)

print(f"Loaded {len(duas_data)} duas records")

# Create documents with ONLY tags for embedding
documents = _build_documents(duas_data)

print(f"Created {len(documents)} documents from tags")

# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Create vector store - embeddings are computed from page_content (the tags)
print("Creating embeddings and storing in pgvector...")
db = PGVector.from_documents(
    embedding=embeddings,
    documents=documents,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
)

print("✓ Vector store created successfully!")