# salam_bot/helpers/audio_analysis.py

import argparse
import os
import re
import warnings
from typing import Dict, Tuple, Optional

import torch
import whisper

# Install a blanket "ignore" filter so no Python warning is shown for the rest
# of the process (presumably to hide noisy Whisper/PyTorch warnings).
# NOTE(review): this also hides warnings raised by unrelated code that imports
# this module — consider narrowing the filter to specific categories/modules.
warnings.filterwarnings('ignore')

class UrduIntentExtractor:
    """Transcribe Urdu speech with Whisper and derive a keyword-based intent.

    The pipeline is: Whisper transcription (Urdu) -> Whisper translation
    (English) -> keyword scoring against ``self.intent_keywords`` -> a simple
    keyword sentiment estimate on the English text.

    Attributes:
        model: The loaded Whisper model used for both transcription passes.
        intent_keywords: Mapping ``intent -> {"urdu": [...], "english": [...]}``
            of trigger keywords/phrases. It is a per-instance dict and may be
            customised after construction.
    """

    # Label and confidence reported when no keyword of any intent matches.
    _FALLBACK_INTENT = "general_conversation"
    _FALLBACK_CONFIDENCE = 0.3

    # Human-readable text for every intent label this class can emit.
    _INTENT_DESCRIPTIONS = {
        "greeting": "👋 Greeting or starting a conversation",
        "question": "❓ Asking a question or seeking clarification",
        "request": "🙏 Making a request or asking for something",
        "command": "⚡ Giving a command or instruction",
        "complaint": "😠 Expressing a complaint or dissatisfaction",
        "information": "ℹ️ Seeking or providing information",
        "emergency": "🚨 Emergency situation requiring immediate attention",
        "appointment": "📅 Scheduling or inquiring about a meeting/appointment",
        "farewell": "👋 Ending the conversation",
        "thanks": "🙏 Expressing gratitude or thanks",
        "general_conversation": "💬 General conversation without specific intent"
    }

    # Keyword lists used by analyze_sentiment (English only).
    _POSITIVE_WORDS = (
        "good", "great", "excellent", "happy", "thanks", "thank", "please",
        "wonderful", "nice", "helpful", "appreciate", "love", "like",
    )
    _NEGATIVE_WORDS = (
        "bad", "wrong", "problem", "issue", "complaint", "angry", "upset",
        "terrible", "horrible", "hate", "not working", "broken", "failed",
    )

    def __init__(self, model_size: str = "large-v3"):
        """
        Initialize Urdu intent extractor using Whisper

        Args:
            model_size: Name passed to ``whisper.load_model`` (for example
                tiny, base, small, medium, large or large-v3, the default)
        """
        print(f"Loading Whisper {model_size} model...")
        self.model = whisper.load_model(model_size)

        # Comprehensive intent mapping for Urdu and English
        self.intent_keywords = {
            "greeting": {
                "urdu": ["سلام", "السلام علیکم", "ہیلو", "آداب", "صبح بخیر", "شام بخیر"],
                "english": ["hello", "hi", "greetings", "good morning", "good evening", "assalam"]
            },
            "question": {
                "urdu": ["کیا", "کب", "کیوں", "کسے", "کہاں", "کس طرح", "کتنا", "کیسے"],
                "english": ["what", "when", "why", "who", "where", "how", "how much", "which"]
            },
            "request": {
                "urdu": ["براہ کرم", "مہربانی", "چاہتا ہوں", "چاہتی ہوں", "درکار ہے", "مدد چاہیے"],
                "english": ["please", "kindly", "want", "need", "require", "help", "could you", "would you"]
            },
            "command": {
                "urdu": ["کرو", "کریں", "لاؤ", "دیں", "بناؤ", "روکو", "جاؤ", "آؤ"],
                "english": ["do", "make", "bring", "give", "create", "stop", "go", "come"]
            },
            "complaint": {
                "urdu": ["شکایت", "مسئلہ", "پریشانی", "غلط", "خراب", "نقص", "برا"],
                "english": ["complaint", "problem", "issue", "wrong", "bad", "fault", "error"]
            },
            "information": {
                "urdu": ["بتائیں", "جانیں", "معلوم", "تفصیل", "رہنمائی", "بتاؤ"],
                "english": ["tell", "know", "information", "details", "guide", "explain"]
            },
            "emergency": {
                "urdu": ["حادثہ", "ایمرجنسی", "تباہی", "بچاؤ", "جلدی", "فوری", "خطرہ"],
                "english": ["accident", "emergency", "help", "urgent", "quick", "danger", "dangerous"]
            },
            "appointment": {
                "urdu": ["ملاقات", "اپائنٹمنٹ", "ٹائم", "تاریخ", "وقت", "دن"],
                "english": ["meeting", "appointment", "time", "date", "schedule", "day"]
            },
            "farewell": {
                "urdu": ["اللہ حافظ", "خدا حافظ", "بای", "اختتام", "ختم", "اگلی بار"],
                "english": ["goodbye", "bye", "farewell", "end", "see you", "next time"]
            },
            "thanks": {
                "urdu": ["شکریہ", "مہربانی", "آپ کا بہت شکریہ", "تھینکس"],
                "english": ["thank", "thanks", "grateful", "appreciate"]
            }
        }

    @staticmethod
    def _contains_phrase(text: str, phrase: str) -> bool:
        """Return True when *phrase* occurs in *text* as whole word(s).

        A plain ``phrase in text`` test reports "hi" inside "this" or "go"
        inside "good"; here the phrase must not be directly preceded or
        followed by another word character. Runs of whitespace between the
        words of a multi-word phrase are treated as equivalent.
        """
        words = phrase.split()
        if not words:
            return False
        body = r"\s+".join(re.escape(word) for word in words)
        return re.search(rf"(?<!\w){body}(?!\w)", text) is not None

    def transcribe_and_translate(self, audio_path: str) -> Dict:
        """
        Transcribe Urdu audio and translate to English using Whisper

        The audio is run through the model twice: once with
        ``task="transcribe"`` and once with ``task="translate"``, both with
        the language forced to Urdu.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary with the stripped texts under "urdu" and "english" and
            the per-pass segment lists under "urdu_segments" and
            "english_segments" (empty lists when the model returns none)
        """
        print(f"\nProcessing audio file: {os.path.basename(audio_path)}")

        # Half precision is only requested when CUDA is available.
        use_fp16 = torch.cuda.is_available()

        # First, transcribe in Urdu
        print("Transcribing in Urdu...")
        urdu_result = self.model.transcribe(
            audio_path,
            language="ur",  # Force Urdu language
            task="transcribe",
            fp16=use_fp16
        )
        urdu_text = urdu_result["text"].strip()

        # Then, translate to English
        print("Translating to English...")
        english_result = self.model.transcribe(
            audio_path,
            language="ur",  # Source language is Urdu
            task="translate",  # This tells Whisper to translate
            fp16=use_fp16
        )
        english_text = english_result["text"].strip()

        return {
            "urdu": urdu_text,
            "english": english_text,
            "urdu_segments": urdu_result.get("segments", []),
            "english_segments": english_result.get("segments", [])
        }

    def extract_intent(self, urdu_text: str, english_text: str) -> Tuple[str, float, Dict]:
        """
        Extract main intent from both Urdu and English texts

        Every intent in ``self.intent_keywords`` is scored by the number of
        its keywords found in the texts; the highest-scoring intent wins
        (the first one in mapping order on a tie).

        Args:
            urdu_text: Original Urdu transcription
            english_text: Translated English text

        Returns:
            Tuple of (intent, confidence, details). ``details`` holds
            "urdu_matches", "english_matches", "urdu_score", "english_score"
            and "total_score" for the winning intent. When nothing matches,
            the intent is "general_conversation" with confidence 0.3 and
            empty/zero details.
        """
        print("\nAnalyzing intent...")

        # Prepare text for analysis
        urdu_lower = urdu_text.lower()
        english_lower = english_text.lower()

        intent_details = {}
        for intent, keywords in self.intent_keywords.items():
            # Urdu keywords stay plain substring matches so that inflected or
            # suffixed word forms still hit their stem keyword.
            urdu_matches = [kw for kw in keywords["urdu"] if kw in urdu_lower]

            # English keywords must match whole words/phrases; substring
            # matching made short keywords such as "hi", "do" or "go" fire
            # inside unrelated words ("this", "window", "good").
            english_matches = [
                kw for kw in keywords["english"]
                if self._contains_phrase(english_lower, kw.lower())
            ]

            urdu_score = len(urdu_matches)
            english_score = len(english_matches)
            total_score = urdu_score + english_score

            if total_score > 0:
                intent_details[intent] = {
                    "urdu_matches": urdu_matches,
                    "english_matches": english_matches,
                    "urdu_score": urdu_score,
                    "english_score": english_score,
                    "total_score": total_score
                }

        if not intent_details:
            # No keyword matched at all: report the generic fallback intent.
            return self._FALLBACK_INTENT, self._FALLBACK_CONFIDENCE, {
                "urdu_matches": [],
                "english_matches": [],
                "urdu_score": 0,
                "english_score": 0,
                "total_score": 0
            }

        # Highest total score wins; max() keeps the earliest intent on a tie.
        main_intent = max(
            intent_details,
            key=lambda name: intent_details[name]["total_score"],
        )
        winner = intent_details[main_intent]

        # Normalise the score by text length: one match per five words
        # (counted over both texts) already yields full confidence.
        total_words = len(english_lower.split()) + len(urdu_lower.split())
        confidence = winner["total_score"] / max(1, total_words / 5)

        # Boost confidence if matches found in both languages
        if winner["urdu_score"] > 0 and winner["english_score"] > 0:
            confidence *= 1.5

        return main_intent, min(confidence, 1.0), winner

    def get_intent_description(self, intent: str) -> str:
        """
        Get human-readable description for intent

        Args:
            intent: Detected intent

        Returns:
            Description string; a generic "unknown" description for labels
            that have no entry
        """
        return self._INTENT_DESCRIPTIONS.get(intent, "💭 Unknown or general conversation")

    def analyze_sentiment(self, english_text: str) -> Tuple[str, float]:
        """
        Basic sentiment analysis based on keywords

        Counts how many distinct positive and negative keywords occur in the
        text as whole words/phrases (so "hate" no longer matches inside
        "whatever", nor "love" inside "glove").

        Args:
            english_text: English translated text

        Returns:
            Tuple of (sentiment, confidence). The sentiment is "positive" or
            "negative" with confidence equal to the winning side's share of
            all matched keywords, or ("neutral", 0.5) when both sides tie
            (including when nothing matches).
        """
        text_lower = english_text.lower()

        positive_count = sum(
            1 for word in self._POSITIVE_WORDS
            if self._contains_phrase(text_lower, word)
        )
        negative_count = sum(
            1 for word in self._NEGATIVE_WORDS
            if self._contains_phrase(text_lower, word)
        )

        if positive_count == negative_count:
            return "neutral", 0.5

        matched = max(1, positive_count + negative_count)
        if positive_count > negative_count:
            return "positive", positive_count / matched
        return "negative", negative_count / matched

    def process_audio_file(self, audio_path: str, verbose: bool = True) -> Dict:
        """
        Main function to process audio file and extract intent

        Args:
            audio_path: Path to audio file
            verbose: Whether to print the formatted result summary

        Returns:
            Dictionary with all analysis results, under the keys "file",
            "transcription", "intent", "sentiment" and "segments"

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist
        """
        # Validate file
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # Transcribe and translate
        results = self.transcribe_and_translate(audio_path)

        # Extract intent
        intent, confidence, intent_details = self.extract_intent(
            results["urdu"],
            results["english"]
        )

        # Analyze sentiment
        sentiment, sentiment_confidence = self.analyze_sentiment(results["english"])

        # Prepare final results
        final_results = {
            "file": os.path.basename(audio_path),
            "transcription": {
                "urdu": results["urdu"],
                "english": results["english"]
            },
            "intent": {
                "type": intent,
                "confidence": confidence,
                "description": self.get_intent_description(intent),
                "details": intent_details
            },
            "sentiment": {
                "type": sentiment,
                "confidence": sentiment_confidence
            },
            "segments": {
                "urdu": results.get("urdu_segments", []),
                "english": results.get("english_segments", [])
            }
        }

        # Print results if verbose
        if verbose:
            self.print_results(final_results)

        return final_results

    def print_results(self, results: Dict):
        """
        Print analysis results in a formatted way

        Args:
            results: Result dictionary in the shape returned by
                ``process_audio_file``
        """
        separator = "=" * 70
        intent = results['intent']
        details = intent['details']
        sentiment = results['sentiment']

        print("\n" + separator)
        print("URDU SPEECH INTENT ANALYSIS RESULTS")
        print(separator)

        print(f"\n📁 File: {results['file']}")

        print("\n🗣️ URDU TRANSCRIPTION:")
        print(f"   {results['transcription']['urdu']}")

        print("\n🌐 ENGLISH TRANSLATION:")
        print(f"   {results['transcription']['english']}")

        print("\n🎯 DETECTED INTENT:")
        print(f"   {intent['description']}")
        print(f"   Confidence: {intent['confidence']:.1%}")

        # Keyword lines are only shown for languages that actually matched.
        if details['urdu_matches']:
            print(f"   Urdu keywords found: {', '.join(details['urdu_matches'])}")
        if details['english_matches']:
            print(f"   English keywords found: {', '.join(details['english_matches'])}")

        print("\n😊 SENTIMENT:")
        print(f"   {sentiment['type'].upper()}")
        print(f"   Confidence: {sentiment['confidence']:.1%}")

        print("\n" + separator)