318 lines
13 KiB
Python
318 lines
13 KiB
Python
import argparse
import os
import re
import warnings
from typing import Dict, Tuple, Optional

import torch
import whisper
|
||
|
||
# Install a process-wide "ignore" filter that applies to every warning category.
# NOTE(review): presumably meant to hide noisy third-party library warnings, but
# it also hides genuine deprecation/runtime warnings — confirm this is intended.
warnings.filterwarnings('ignore')
|
||
|
||
class UrduIntentExtractor:
|
||
def __init__(self, model_size: str = "large-v3"):
|
||
"""
|
||
Initialize Urdu intent extractor using Whisper
|
||
|
||
Args:
|
||
model_size: Whisper model size (tiny, base, small, medium, large)
|
||
"""
|
||
print(f"Loading Whisper {model_size} model...")
|
||
self.model = whisper.load_model(model_size)
|
||
|
||
# Comprehensive intent mapping for Urdu and English
|
||
self.intent_keywords = {
|
||
"greeting": {
|
||
"urdu": ["سلام", "السلام علیکم", "ہیلو", "آداب", "صبح بخیر", "شام بخیر"],
|
||
"english": ["hello", "hi", "greetings", "good morning", "good evening", "assalam"]
|
||
},
|
||
"question": {
|
||
"urdu": ["کیا", "کب", "کیوں", "کسے", "کہاں", "کس طرح", "کتنا", "کیسے"],
|
||
"english": ["what", "when", "why", "who", "where", "how", "how much", "which"]
|
||
},
|
||
"request": {
|
||
"urdu": ["براہ کرم", "مہربانی", "چاہتا ہوں", "چاہتی ہوں", "درکار ہے", "مدد چاہیے"],
|
||
"english": ["please", "kindly", "want", "need", "require", "help", "could you", "would you"]
|
||
},
|
||
"command": {
|
||
"urdu": ["کرو", "کریں", "لاؤ", "دیں", "بناؤ", "روکو", "جاؤ", "آؤ"],
|
||
"english": ["do", "make", "bring", "give", "create", "stop", "go", "come"]
|
||
},
|
||
"complaint": {
|
||
"urdu": ["شکایت", "مسئلہ", "پریشانی", "غلط", "خراب", "نقص", "برا"],
|
||
"english": ["complaint", "problem", "issue", "wrong", "bad", "fault", "error"]
|
||
},
|
||
"information": {
|
||
"urdu": ["بتائیں", "جانیں", "معلوم", "تفصیل", "رہنمائی", "بتاؤ"],
|
||
"english": ["tell", "know", "information", "details", "guide", "explain"]
|
||
},
|
||
"emergency": {
|
||
"urdu": ["حادثہ", "ایمرجنسی", "تباہی", "بچاؤ", "جلدی", "فوری", "خطرہ"],
|
||
"english": ["accident", "emergency", "help", "urgent", "quick", "danger", "dangerous"]
|
||
},
|
||
"appointment": {
|
||
"urdu": ["ملاقات", "اپائنٹمنٹ", "ٹائم", "تاریخ", "وقت", "دن"],
|
||
"english": ["meeting", "appointment", "time", "date", "schedule", "day"]
|
||
},
|
||
"farewell": {
|
||
"urdu": ["اللہ حافظ", "خدا حافظ", "بای", "اختتام", "ختم", "اگلی بار"],
|
||
"english": ["goodbye", "bye", "farewell", "end", "see you", "next time"]
|
||
},
|
||
"thanks": {
|
||
"urdu": ["شکریہ", "مہربانی", "آپ کا بہت شکریہ", "تھینکس"],
|
||
"english": ["thank", "thanks", "grateful", "appreciate"]
|
||
}
|
||
}
|
||
|
||
def transcribe_and_translate(self, audio_path: str) -> Dict[str, str]:
|
||
"""
|
||
Transcribe Urdu audio and translate to English using Whisper
|
||
|
||
Args:
|
||
audio_path: Path to audio file
|
||
|
||
Returns:
|
||
Dictionary containing Urdu transcription and English translation
|
||
"""
|
||
print(f"\nProcessing audio file: {os.path.basename(audio_path)}")
|
||
|
||
# First, transcribe in Urdu
|
||
print("Transcribing in Urdu...")
|
||
urdu_result = self.model.transcribe(
|
||
audio_path,
|
||
language="ur", # Force Urdu language
|
||
task="transcribe",
|
||
fp16=torch.cuda.is_available()
|
||
)
|
||
urdu_text = urdu_result["text"].strip()
|
||
|
||
# Then, translate to English
|
||
print("Translating to English...")
|
||
english_result = self.model.transcribe(
|
||
audio_path,
|
||
language="ur", # Source language is Urdu
|
||
task="translate", # This tells Whisper to translate
|
||
fp16=torch.cuda.is_available()
|
||
)
|
||
english_text = english_result["text"].strip()
|
||
|
||
return {
|
||
"urdu": urdu_text,
|
||
"english": english_text,
|
||
"urdu_segments": urdu_result.get("segments", []),
|
||
"english_segments": english_result.get("segments", [])
|
||
}
|
||
|
||
def extract_intent(self, urdu_text: str, english_text: str) -> Tuple[str, float, Dict]:
|
||
"""
|
||
Extract main intent from both Urdu and English texts
|
||
|
||
Args:
|
||
urdu_text: Original Urdu transcription
|
||
english_text: Translated English text
|
||
|
||
Returns:
|
||
Tuple of (intent, confidence, details)
|
||
"""
|
||
print("\nAnalyzing intent...")
|
||
|
||
# Prepare text for analysis
|
||
urdu_lower = urdu_text.lower()
|
||
english_lower = english_text.lower()
|
||
|
||
# Calculate intent scores
|
||
intent_scores = {}
|
||
intent_details = {}
|
||
|
||
for intent, keywords in self.intent_keywords.items():
|
||
# Count Urdu keyword matches
|
||
urdu_matches = []
|
||
for keyword in keywords["urdu"]:
|
||
if keyword in urdu_lower:
|
||
urdu_matches.append(keyword)
|
||
|
||
# Count English keyword matches
|
||
english_matches = []
|
||
for keyword in keywords["english"]:
|
||
if keyword.lower() in english_lower:
|
||
english_matches.append(keyword)
|
||
|
||
# Calculate scores
|
||
urdu_score = len(urdu_matches)
|
||
english_score = len(english_matches)
|
||
total_score = urdu_score + english_score
|
||
|
||
if total_score > 0:
|
||
intent_scores[intent] = total_score
|
||
intent_details[intent] = {
|
||
"urdu_matches": urdu_matches,
|
||
"english_matches": english_matches,
|
||
"urdu_score": urdu_score,
|
||
"english_score": english_score,
|
||
"total_score": total_score
|
||
}
|
||
|
||
# Determine main intent
|
||
if intent_scores:
|
||
# Get intent with highest score
|
||
main_intent = max(intent_scores, key=intent_scores.get)
|
||
|
||
# Calculate confidence based on multiple factors
|
||
total_words = len(english_lower.split()) + len(urdu_lower.split())
|
||
base_confidence = intent_scores[main_intent] / max(1, total_words / 5)
|
||
|
||
# Boost confidence if matches found in both languages
|
||
if (intent_details[main_intent]["urdu_score"] > 0 and
|
||
intent_details[main_intent]["english_score"] > 0):
|
||
base_confidence *= 1.5
|
||
|
||
confidence = min(base_confidence, 1.0)
|
||
else:
|
||
main_intent = "general_conversation"
|
||
confidence = 0.3
|
||
intent_details[main_intent] = {
|
||
"urdu_matches": [],
|
||
"english_matches": [],
|
||
"urdu_score": 0,
|
||
"english_score": 0,
|
||
"total_score": 0
|
||
}
|
||
|
||
return main_intent, confidence, intent_details[main_intent]
|
||
|
||
def get_intent_description(self, intent: str) -> str:
|
||
"""
|
||
Get human-readable description for intent
|
||
|
||
Args:
|
||
intent: Detected intent
|
||
|
||
Returns:
|
||
Description string
|
||
"""
|
||
descriptions = {
|
||
"greeting": "👋 Greeting or starting a conversation",
|
||
"question": "❓ Asking a question or seeking clarification",
|
||
"request": "🙏 Making a request or asking for something",
|
||
"command": "⚡ Giving a command or instruction",
|
||
"complaint": "😠 Expressing a complaint or dissatisfaction",
|
||
"information": "ℹ️ Seeking or providing information",
|
||
"emergency": "🚨 Emergency situation requiring immediate attention",
|
||
"appointment": "📅 Scheduling or inquiring about a meeting/appointment",
|
||
"farewell": "👋 Ending the conversation",
|
||
"thanks": "🙏 Expressing gratitude or thanks",
|
||
"general_conversation": "💬 General conversation without specific intent"
|
||
}
|
||
return descriptions.get(intent, "💭 Unknown or general conversation")
|
||
|
||
def analyze_sentiment(self, english_text: str) -> Tuple[str, float]:
|
||
"""
|
||
Basic sentiment analysis based on keywords
|
||
|
||
Args:
|
||
english_text: English translated text
|
||
|
||
Returns:
|
||
Tuple of (sentiment, confidence)
|
||
"""
|
||
positive_words = ["good", "great", "excellent", "happy", "thanks", "thank", "please",
|
||
"wonderful", "nice", "helpful", "appreciate", "love", "like"]
|
||
negative_words = ["bad", "wrong", "problem", "issue", "complaint", "angry", "upset",
|
||
"terrible", "horrible", "hate", "not working", "broken", "failed"]
|
||
|
||
text_lower = english_text.lower()
|
||
|
||
positive_count = sum(1 for word in positive_words if word in text_lower)
|
||
negative_count = sum(1 for word in negative_words if word in text_lower)
|
||
|
||
if positive_count > negative_count:
|
||
return "positive", positive_count / max(1, (positive_count + negative_count))
|
||
elif negative_count > positive_count:
|
||
return "negative", negative_count / max(1, (positive_count + negative_count))
|
||
else:
|
||
return "neutral", 0.5
|
||
|
||
def process_audio_file(self, audio_path: str, verbose: bool = True) -> Dict:
|
||
"""
|
||
Main function to process audio file and extract intent
|
||
|
||
Args:
|
||
audio_path: Path to audio file
|
||
verbose: Whether to print detailed output
|
||
|
||
Returns:
|
||
Dictionary with all analysis results
|
||
"""
|
||
# Validate file
|
||
if not os.path.exists(audio_path):
|
||
raise FileNotFoundError(f"Audio file not found: {audio_path}")
|
||
|
||
# Transcribe and translate
|
||
results = self.transcribe_and_translate(audio_path)
|
||
|
||
# Extract intent
|
||
intent, confidence, intent_details = self.extract_intent(
|
||
results["urdu"],
|
||
results["english"]
|
||
)
|
||
|
||
# Analyze sentiment
|
||
sentiment, sentiment_confidence = self.analyze_sentiment(results["english"])
|
||
|
||
# Prepare final results
|
||
final_results = {
|
||
"file": os.path.basename(audio_path),
|
||
"transcription": {
|
||
"urdu": results["urdu"],
|
||
"english": results["english"]
|
||
},
|
||
"intent": {
|
||
"type": intent,
|
||
"confidence": confidence,
|
||
"description": self.get_intent_description(intent),
|
||
"details": intent_details
|
||
},
|
||
"sentiment": {
|
||
"type": sentiment,
|
||
"confidence": sentiment_confidence
|
||
},
|
||
"segments": {
|
||
"urdu": results.get("urdu_segments", []),
|
||
"english": results.get("english_segments", [])
|
||
}
|
||
}
|
||
|
||
# Print results if verbose
|
||
if verbose:
|
||
self.print_results(final_results)
|
||
|
||
return final_results
|
||
|
||
def print_results(self, results: Dict):
|
||
"""
|
||
Print analysis results in a formatted way
|
||
"""
|
||
print("\n" + "="*70)
|
||
print("URDU SPEECH INTENT ANALYSIS RESULTS")
|
||
print("="*70)
|
||
|
||
print(f"\n📁 File: {results['file']}")
|
||
|
||
print(f"\n🗣️ URDU TRANSCRIPTION:")
|
||
print(f" {results['transcription']['urdu']}")
|
||
|
||
print(f"\n🌐 ENGLISH TRANSLATION:")
|
||
print(f" {results['transcription']['english']}")
|
||
|
||
print(f"\n🎯 DETECTED INTENT:")
|
||
print(f" {results['intent']['description']}")
|
||
print(f" Confidence: {results['intent']['confidence']:.1%}")
|
||
|
||
if results['intent']['details']['urdu_matches']:
|
||
print(f" Urdu keywords found: {', '.join(results['intent']['details']['urdu_matches'])}")
|
||
if results['intent']['details']['english_matches']:
|
||
print(f" English keywords found: {', '.join(results['intent']['details']['english_matches'])}")
|
||
|
||
print(f"\n😊 SENTIMENT:")
|
||
print(f" {results['sentiment']['type'].upper()}")
|
||
print(f" Confidence: {results['sentiment']['confidence']:.1%}")
|
||
|
||
print("\n" + "="*70) |