salam_bot/helpers/audio_analysis.py

318 lines
13 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import argparse
import os
import re
import warnings
from typing import Dict, Tuple, Optional

import torch
import whisper
# Silence every Python warning for the whole process: no category, module or
# message filter is given, so this applies to all warnings raised after import.
warnings.filterwarnings('ignore')
class UrduIntentExtractor:
def __init__(self, model_size: str = "large-v3"):
    """
    Load a Whisper model and set up the keyword table used for intent detection.

    Args:
        model_size: Model name handed to ``whisper.load_model``
            (for example tiny, base, small, medium, or the default large-v3).
    """
    print("Loading Whisper {} model...".format(model_size))
    self.model = whisper.load_model(model_size)

    # Intent name -> keyword lists per language. Insertion order matters:
    # it decides which intent wins when two intents end up with equal scores.
    self.intent_keywords = {
        "greeting": dict(
            urdu=["سلام", "السلام علیکم", "ہیلو", "آداب", "صبح بخیر", "شام بخیر"],
            english=["hello", "hi", "greetings", "good morning", "good evening", "assalam"],
        ),
        "question": dict(
            urdu=["کیا", "کب", "کیوں", "کسے", "کہاں", "کس طرح", "کتنا", "کیسے"],
            english=["what", "when", "why", "who", "where", "how", "how much", "which"],
        ),
        "request": dict(
            urdu=["براہ کرم", "مہربانی", "چاہتا ہوں", "چاہتی ہوں", "درکار ہے", "مدد چاہیے"],
            english=["please", "kindly", "want", "need", "require", "help", "could you", "would you"],
        ),
        "command": dict(
            urdu=["کرو", "کریں", "لاؤ", "دیں", "بناؤ", "روکو", "جاؤ", "آؤ"],
            english=["do", "make", "bring", "give", "create", "stop", "go", "come"],
        ),
        "complaint": dict(
            urdu=["شکایت", "مسئلہ", "پریشانی", "غلط", "خراب", "نقص", "برا"],
            english=["complaint", "problem", "issue", "wrong", "bad", "fault", "error"],
        ),
        "information": dict(
            urdu=["بتائیں", "جانیں", "معلوم", "تفصیل", "رہنمائی", "بتاؤ"],
            english=["tell", "know", "information", "details", "guide", "explain"],
        ),
        "emergency": dict(
            urdu=["حادثہ", "ایمرجنسی", "تباہی", "بچاؤ", "جلدی", "فوری", "خطرہ"],
            english=["accident", "emergency", "help", "urgent", "quick", "danger", "dangerous"],
        ),
        "appointment": dict(
            urdu=["ملاقات", "اپائنٹمنٹ", "ٹائم", "تاریخ", "وقت", "دن"],
            english=["meeting", "appointment", "time", "date", "schedule", "day"],
        ),
        "farewell": dict(
            urdu=["اللہ حافظ", "خدا حافظ", "بای", "اختتام", "ختم", "اگلی بار"],
            english=["goodbye", "bye", "farewell", "end", "see you", "next time"],
        ),
        "thanks": dict(
            urdu=["شکریہ", "مہربانی", "آپ کا بہت شکریہ", "تھینکس"],
            english=["thank", "thanks", "grateful", "appreciate"],
        ),
    }
def transcribe_and_translate(self, audio_path: str) -> Dict[str, str]:
    """
    Run Whisper twice over one audio file: once to transcribe the Urdu speech
    and once to translate it into English.

    Args:
        audio_path: Path to audio file
    Returns:
        Dictionary with the stripped Urdu text ("urdu"), the stripped English
        text ("english") and the segment lists of both passes
        ("urdu_segments", "english_segments"; empty list when a pass
        reports no segments).
    """
    def run_pass(task):
        # Both passes pin the source language to Urdu; only the task differs.
        # Half precision is requested only when CUDA is available.
        return self.model.transcribe(
            audio_path,
            language="ur",
            task=task,
            fp16=torch.cuda.is_available(),
        )

    print(f"\nProcessing audio file: {os.path.basename(audio_path)}")

    print("Transcribing in Urdu...")
    transcription = run_pass("transcribe")
    urdu_text = transcription["text"].strip()

    print("Translating to English...")
    translation = run_pass("translate")
    english_text = translation["text"].strip()

    return {
        "urdu": urdu_text,
        "english": english_text,
        "urdu_segments": transcription.get("segments", []),
        "english_segments": translation.get("segments", []),
    }
def extract_intent(self, urdu_text: str, english_text: str) -> Tuple[str, float, Dict]:
    """
    Extract main intent from both Urdu and English texts

    Each intent in ``self.intent_keywords`` scores one point per distinct
    keyword found. English keywords must appear as whole words/phrases
    (so "hi" no longer fires inside "this", nor "go" inside "good");
    Urdu keywords are still matched as plain substrings.

    Args:
        urdu_text: Original Urdu transcription
        english_text: Translated English text
    Returns:
        Tuple of (intent, confidence, details). ``details`` holds the
        matched keywords and scores of the winning intent. When nothing
        matches, the intent is "general_conversation" with confidence 0.3
        and empty details.
    """
    print("\nAnalyzing intent...")

    urdu_lower = urdu_text.lower()
    english_lower = english_text.lower()

    def has_english_phrase(phrase: str) -> bool:
        # Lookarounds instead of \b so the keyword must not touch another
        # word character on either side, whatever its own first/last char is.
        pattern = rf"(?<!\w){re.escape(phrase.lower())}(?!\w)"
        return re.search(pattern, english_lower) is not None

    intent_scores = {}
    intent_details = {}
    for intent, keywords in self.intent_keywords.items():
        urdu_matches = [kw for kw in keywords["urdu"] if kw in urdu_lower]
        english_matches = [kw for kw in keywords["english"] if has_english_phrase(kw)]

        urdu_score = len(urdu_matches)
        english_score = len(english_matches)
        total_score = urdu_score + english_score
        if total_score > 0:
            intent_scores[intent] = total_score
            intent_details[intent] = {
                "urdu_matches": urdu_matches,
                "english_matches": english_matches,
                "urdu_score": urdu_score,
                "english_score": english_score,
                "total_score": total_score,
            }

    if not intent_scores:
        # Nothing matched: fall back to a fixed low-confidence default.
        return "general_conversation", 0.3, {
            "urdu_matches": [],
            "english_matches": [],
            "urdu_score": 0,
            "english_score": 0,
            "total_score": 0,
        }

    # Highest score wins; ties go to the intent listed first in the table.
    main_intent = max(intent_scores, key=intent_scores.get)
    winner = intent_details[main_intent]

    # Normalise by text length: one keyword hit per five words (both texts
    # combined) counts as full confidence.
    total_words = len(english_lower.split()) + len(urdu_lower.split())
    base_confidence = intent_scores[main_intent] / max(1, total_words / 5)

    # Agreement between the transcription and the translation is stronger evidence.
    if winner["urdu_score"] > 0 and winner["english_score"] > 0:
        base_confidence *= 1.5

    confidence = min(base_confidence, 1.0)
    return main_intent, confidence, winner
def get_intent_description(self, intent: str) -> str:
    """
    Get human-readable description for intent

    Args:
        intent: Detected intent
    Returns:
        Icon-prefixed description string; a generic "unknown" description
        for intents that are not in the table.
    """
    descriptions = {
        "greeting": "👋 Greeting or starting a conversation",
        "question": "❓ Asking a question or seeking clarification",
        "request": "🙏 Making a request or asking for something",
        "command": "⚡ Giving a command or instruction",
        "complaint": "😠 Expressing a complaint or dissatisfaction",
        # This entry previously began with a bare space and no icon, unlike
        # every other description; the icon is restored for consistency.
        "information": "ℹ️ Seeking or providing information",
        "emergency": "🚨 Emergency situation requiring immediate attention",
        "appointment": "📅 Scheduling or inquiring about a meeting/appointment",
        "farewell": "👋 Ending the conversation",
        "thanks": "🙏 Expressing gratitude or thanks",
        "general_conversation": "💬 General conversation without specific intent",
    }
    return descriptions.get(intent, "💭 Unknown or general conversation")
def analyze_sentiment(self, english_text: str) -> Tuple[str, float]:
    """
    Basic sentiment analysis based on keywords

    Counts how many distinct positive and negative terms occur in the text.
    Terms must appear as whole words/phrases, so "like" is not found inside
    "unlikely" and "thank" is not counted a second time inside "thanks".

    Args:
        english_text: English translated text
    Returns:
        Tuple of (sentiment, confidence). Sentiment is "positive",
        "negative" or "neutral"; confidence is the winning side's share of
        all hits, or 0.5 for a tie (including no hits at all).
    """
    positive_words = ["good", "great", "excellent", "happy", "thanks", "thank", "please",
                      "wonderful", "nice", "helpful", "appreciate", "love", "like"]
    negative_words = ["bad", "wrong", "problem", "issue", "complaint", "angry", "upset",
                      "terrible", "horrible", "hate", "not working", "broken", "failed"]
    text_lower = english_text.lower()

    def count_hits(vocabulary) -> int:
        # A term counts once however often it repeats; the lookarounds
        # require it to stand alone rather than sit inside a longer word.
        return sum(
            1
            for term in vocabulary
            if re.search(rf"(?<!\w){re.escape(term)}(?!\w)", text_lower)
        )

    positive_count = count_hits(positive_words)
    negative_count = count_hits(negative_words)
    total = positive_count + negative_count

    # In both non-neutral branches total >= 1, so the division is safe.
    if positive_count > negative_count:
        return "positive", positive_count / total
    if negative_count > positive_count:
        return "negative", negative_count / total
    return "neutral", 0.5
def process_audio_file(self, audio_path: str, verbose: bool = True) -> Dict:
    """
    Main function to process audio file and extract intent

    Validates the path, transcribes and translates the audio, derives intent
    and sentiment from the resulting texts and bundles everything into one
    result dictionary.

    Args:
        audio_path: Path to audio file
        verbose: Whether to print detailed output
    Returns:
        Dictionary with keys "file", "transcription", "intent", "sentiment"
        and "segments".
    Raises:
        FileNotFoundError: If ``audio_path`` does not exist.
        IsADirectoryError: If ``audio_path`` points to a directory.
    """
    # Validate the path up front so bad input fails with a clear error
    # before any transcription work starts.
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")
    if os.path.isdir(audio_path):
        raise IsADirectoryError(f"Audio path is a directory, not a file: {audio_path}")

    # Transcribe and translate
    results = self.transcribe_and_translate(audio_path)

    # Extract intent
    intent, confidence, intent_details = self.extract_intent(
        results["urdu"],
        results["english"],
    )

    # Sentiment is computed on the English translation only.
    sentiment, sentiment_confidence = self.analyze_sentiment(results["english"])

    final_results = {
        "file": os.path.basename(audio_path),
        "transcription": {
            "urdu": results["urdu"],
            "english": results["english"],
        },
        "intent": {
            "type": intent,
            "confidence": confidence,
            "description": self.get_intent_description(intent),
            "details": intent_details,
        },
        "sentiment": {
            "type": sentiment,
            "confidence": sentiment_confidence,
        },
        "segments": {
            "urdu": results.get("urdu_segments", []),
            "english": results.get("english_segments", []),
        },
    }

    if verbose:
        self.print_results(final_results)
    return final_results
def print_results(self, results: Dict):
    """
    Write a human-readable report of one analysis result to stdout.

    Args:
        results: Result dictionary providing "file", "transcription",
            "intent" and "sentiment" entries.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("URDU SPEECH INTENT ANALYSIS RESULTS")
    print(rule)

    print(f"\n📁 File: {results['file']}")

    print("\n🗣️ URDU TRANSCRIPTION:")
    print(f" {results['transcription']['urdu']}")
    print("\n🌐 ENGLISH TRANSLATION:")
    print(f" {results['transcription']['english']}")

    print("\n🎯 DETECTED INTENT:")
    intent_info = results['intent']
    print(f" {intent_info['description']}")
    print(f" Confidence: {intent_info['confidence']:.1%}")

    # Keyword lines appear only for the languages that actually matched.
    urdu_hits = intent_info['details']['urdu_matches']
    if urdu_hits:
        print(f" Urdu keywords found: {', '.join(urdu_hits)}")
    english_hits = intent_info['details']['english_matches']
    if english_hits:
        print(f" English keywords found: {', '.join(english_hits)}")

    print("\n😊 SENTIMENT:")
    sentiment_info = results['sentiment']
    print(f" {sentiment_info['type'].upper()}")
    print(f" Confidence: {sentiment_info['confidence']:.1%}")

    print("\n" + rule)