diff --git a/helpers/audio_analysis.py b/helpers/audio_analysis.py
new file mode 100644
index 0000000..1aacee8
--- /dev/null
+++ b/helpers/audio_analysis.py
@@ -0,0 +1,318 @@
+import whisper
+import torch
+import argparse
+import os
+from typing import Dict, Tuple, Optional
+import warnings
+
+warnings.filterwarnings('ignore')
+
+class UrduIntentExtractor:
+    def __init__(self, model_size: str = "large-v3"):
+        """
+        Initialize Urdu intent extractor using Whisper
+
+        Args:
+            model_size: Whisper model size (tiny, base, small, medium, large, large-v3)
+        """
+        print(f"Loading Whisper {model_size} model...")
+        self.model = whisper.load_model(model_size)
+
+        # Comprehensive intent mapping for Urdu and English
+        self.intent_keywords = {
+            "greeting": {
+                "urdu": ["سلام", "السلام علیکم", "ہیلو", "آداب", "صبح بخیر", "شام بخیر"],
+                "english": ["hello", "hi", "greetings", "good morning", "good evening", "assalam"]
+            },
+            "question": {
+                "urdu": ["کیا", "کب", "کیوں", "کسے", "کہاں", "کس طرح", "کتنا", "کیسے"],
+                "english": ["what", "when", "why", "who", "where", "how", "how much", "which"]
+            },
+            "request": {
+                "urdu": ["براہ کرم", "مہربانی", "چاہتا ہوں", "چاہتی ہوں", "درکار ہے", "مدد چاہیے"],
+                "english": ["please", "kindly", "want", "need", "require", "help", "could you", "would you"]
+            },
+            "command": {
+                "urdu": ["کرو", "کریں", "لاؤ", "دیں", "بناؤ", "روکو", "جاؤ", "آؤ"],
+                "english": ["do", "make", "bring", "give", "create", "stop", "go", "come"]
+            },
+            "complaint": {
+                "urdu": ["شکایت", "مسئلہ", "پریشانی", "غلط", "خراب", "نقص", "برا"],
+                "english": ["complaint", "problem", "issue", "wrong", "bad", "fault", "error"]
+            },
+            "information": {
+                "urdu": ["بتائیں", "جانیں", "معلوم", "تفصیل", "رہنمائی", "بتاؤ"],
+                "english": ["tell", "know", "information", "details", "guide", "explain"]
+            },
+            "emergency": {
+                "urdu": ["حادثہ", "ایمرجنسی", "تباہی", "بچاؤ", "جلدی", "فوری", "خطرہ"],
+                "english": ["accident", "emergency", "help", "urgent", "quick", "danger", "dangerous"]
+            },
+            "appointment": {
+                "urdu": ["ملاقات", "اپائنٹمنٹ", "ٹائم", "تاریخ", "وقت", "دن"],
+                "english": ["meeting", "appointment", "time", "date", "schedule", "day"]
+            },
+            "farewell": {
+                "urdu": ["اللہ حافظ", "خدا حافظ", "بای", "اختتام", "ختم", "اگلی بار"],
+                "english": ["goodbye", "bye", "farewell", "end", "see you", "next time"]
+            },
+            "thanks": {
+                "urdu": ["شکریہ", "مہربانی", "آپ کا بہت شکریہ", "تھینکس"],
+                "english": ["thank", "thanks", "grateful", "appreciate"]
+            }
+        }
+
+    def transcribe_and_translate(self, audio_path: str) -> Dict[str, str]:
+        """
+        Transcribe Urdu audio and translate to English using Whisper
+
+        Args:
+            audio_path: Path to audio file
+
+        Returns:
+            Dictionary containing Urdu transcription and English translation
+        """
+        print(f"\nProcessing audio file: {os.path.basename(audio_path)}")
+
+        # First, transcribe in Urdu
+        print("Transcribing in Urdu...")
+        urdu_result = self.model.transcribe(
+            audio_path,
+            language="ur",  # Force Urdu language
+            task="transcribe",
+            fp16=torch.cuda.is_available()
+        )
+        urdu_text = urdu_result["text"].strip()
+
+        # Then, translate to English
+        print("Translating to English...")
+        english_result = self.model.transcribe(
+            audio_path,
+            language="ur",  # Source language is Urdu
+            task="translate",  # This tells Whisper to translate
+            fp16=torch.cuda.is_available()
+        )
+        english_text = english_result["text"].strip()
+
+        return {
+            "urdu": urdu_text,
+            "english": english_text,
+            "urdu_segments": urdu_result.get("segments", []),
+            "english_segments": english_result.get("segments", [])
+        }
+
+    def extract_intent(self, urdu_text: str, english_text: str) -> Tuple[str, float, Dict]:
+        """
+        Extract main intent from both Urdu and English texts
+
+        Args:
+            urdu_text: Original Urdu transcription
+            english_text: Translated English text
+
+        Returns:
+            Tuple of (intent, confidence, details)
+        """
+        print("\nAnalyzing intent...")
+
+        # Prepare text for analysis
+        urdu_lower = urdu_text.lower()
+        english_lower = english_text.lower()
+
+        # Calculate intent scores
+        intent_scores = {}
+        intent_details = {}
+
+        for intent, keywords in self.intent_keywords.items():
+            # Count Urdu keyword matches
+            urdu_matches = []
+            for keyword in keywords["urdu"]:
+                if keyword in urdu_lower:
+                    urdu_matches.append(keyword)
+
+            # Count English keyword matches
+            english_matches = []
+            for keyword in keywords["english"]:
+                if keyword.lower() in english_lower:
+                    english_matches.append(keyword)
+
+            # Calculate scores
+            urdu_score = len(urdu_matches)
+            english_score = len(english_matches)
+            total_score = urdu_score + english_score
+
+            if total_score > 0:
+                intent_scores[intent] = total_score
+                intent_details[intent] = {
+                    "urdu_matches": urdu_matches,
+                    "english_matches": english_matches,
+                    "urdu_score": urdu_score,
+                    "english_score": english_score,
+                    "total_score": total_score
+                }
+
+        # Determine main intent
+        if intent_scores:
+            # Get intent with highest score
+            main_intent = max(intent_scores, key=intent_scores.get)
+
+            # Calculate confidence based on multiple factors
+            total_words = len(english_lower.split()) + len(urdu_lower.split())
+            base_confidence = intent_scores[main_intent] / max(1, total_words / 5)
+
+            # Boost confidence if matches found in both languages
+            if (intent_details[main_intent]["urdu_score"] > 0 and
+                    intent_details[main_intent]["english_score"] > 0):
+                base_confidence *= 1.5
+
+            confidence = min(base_confidence, 1.0)
+        else:
+            main_intent = "general_conversation"
+            confidence = 0.3
+            intent_details[main_intent] = {
+                "urdu_matches": [],
+                "english_matches": [],
+                "urdu_score": 0,
+                "english_score": 0,
+                "total_score": 0
+            }
+
+        return main_intent, confidence, intent_details[main_intent]
+
+    def get_intent_description(self, intent: str) -> str:
+        """
+        Get human-readable description for intent
+
+        Args:
+            intent: Detected intent
+
+        Returns:
+            Description string
+        """
+        descriptions = {
+            "greeting": "👋 Greeting or starting a conversation",
+            "question": "❓ Asking a question or seeking clarification",
+            "request": "🙏 Making a request or asking for something",
+            "command": "⚡ Giving a command or instruction",
+            "complaint": "😠 Expressing a complaint or dissatisfaction",
+            "information": "ℹ️ Seeking or providing information",
+            "emergency": "🚨 Emergency situation requiring immediate attention",
+            "appointment": "📅 Scheduling or inquiring about a meeting/appointment",
+            "farewell": "👋 Ending the conversation",
+            "thanks": "🙏 Expressing gratitude or thanks",
+            "general_conversation": "💬 General conversation without specific intent"
+        }
+        return descriptions.get(intent, "💭 Unknown or general conversation")
+
+    def analyze_sentiment(self, english_text: str) -> Tuple[str, float]:
+        """
+        Basic sentiment analysis based on keywords
+
+        Args:
+            english_text: English translated text
+
+        Returns:
+            Tuple of (sentiment, confidence)
+        """
+        positive_words = ["good", "great", "excellent", "happy", "thanks", "thank", "please",
+                          "wonderful", "nice", "helpful", "appreciate", "love", "like"]
+        negative_words = ["bad", "wrong", "problem", "issue", "complaint", "angry", "upset",
+                          "terrible", "horrible", "hate", "not working", "broken", "failed"]
+
+        text_lower = english_text.lower()
+
+        positive_count = sum(1 for word in positive_words if word in text_lower)
+        negative_count = sum(1 for word in negative_words if word in text_lower)
+
+        if positive_count > negative_count:
+            return "positive", positive_count / max(1, (positive_count + negative_count))
+        elif negative_count > positive_count:
+            return "negative", negative_count / max(1, (positive_count + negative_count))
+        else:
+            return "neutral", 0.5
+
+    def process_audio_file(self, audio_path: str, verbose: bool = True) -> Dict:
+        """
+        Main function to process audio file and extract intent
+
+        Args:
+            audio_path: Path to audio file
+            verbose: Whether to print detailed output
+
+        Returns:
+            Dictionary with all analysis results
+        """
+        # Validate file
+        if not os.path.exists(audio_path):
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        # Transcribe and translate
+        results = self.transcribe_and_translate(audio_path)
+
+        # Extract intent
+        intent, confidence, intent_details = self.extract_intent(
+            results["urdu"],
+            results["english"]
+        )
+
+        # Analyze sentiment
+        sentiment, sentiment_confidence = self.analyze_sentiment(results["english"])
+
+        # Prepare final results
+        final_results = {
+            "file": os.path.basename(audio_path),
+            "transcription": {
+                "urdu": results["urdu"],
+                "english": results["english"]
+            },
+            "intent": {
+                "type": intent,
+                "confidence": confidence,
+                "description": self.get_intent_description(intent),
+                "details": intent_details
+            },
+            "sentiment": {
+                "type": sentiment,
+                "confidence": sentiment_confidence
+            },
+            "segments": {
+                "urdu": results.get("urdu_segments", []),
+                "english": results.get("english_segments", [])
+            }
+        }
+
+        # Print results if verbose
+        if verbose:
+            self.print_results(final_results)
+
+        return final_results
+
+    def print_results(self, results: Dict):
+        """
+        Print analysis results in a formatted way
+        """
+        print("\n" + "="*70)
+        print("URDU SPEECH INTENT ANALYSIS RESULTS")
+        print("="*70)
+
+        print(f"\n📁 File: {results['file']}")
+
+        print(f"\n🗣️ URDU TRANSCRIPTION:")
+        print(f"   {results['transcription']['urdu']}")
+
+        print(f"\n🌐 ENGLISH TRANSLATION:")
+        print(f"   {results['transcription']['english']}")
+
+        print(f"\n🎯 DETECTED INTENT:")
+        print(f"   {results['intent']['description']}")
+        print(f"   Confidence: {results['intent']['confidence']:.1%}")
+
+        if results['intent']['details']['urdu_matches']:
+            print(f"   Urdu keywords found: {', '.join(results['intent']['details']['urdu_matches'])}")
+        if results['intent']['details']['english_matches']:
+            print(f"   English keywords found: {', '.join(results['intent']['details']['english_matches'])}")
+
+        print(f"\n😊 SENTIMENT:")
+        print(f"   {results['sentiment']['type'].upper()}")
+        print(f"   Confidence: {results['sentiment']['confidence']:.1%}")
+
+        print("\n" + "="*70)
\ No newline at end of file