""" Structured output parsing using LlamaIndex Pydantic Programs. Ensures consistent image formatting in agent responses. HACKATHON OPTIMIZED: Uses regex extraction instead of LLM calls for speed. """ from typing import List, Optional import re from pydantic import BaseModel, Field class BirdIdentificationResponse(BaseModel): """Structured response for bird identification using LlamaIndex Pydantic.""" summary: str = Field( description="Main response text with bird identification, facts, or information" ) species_name: Optional[str] = Field( default=None, description="Common name of the bird species (e.g., 'Northern Cardinal')" ) image_urls: List[str] = Field( default_factory=list, description="List of image URLs to display for this bird" ) audio_urls: List[str] = Field( default_factory=list, description="List of audio URLs (bird calls/songs)" ) confidence_score: Optional[float] = Field( default=None, description="Confidence score from classifier (0.0-1.0)" ) def extract_urls_from_text(text: str) -> tuple[List[str], List[str]]: """ Extract image and audio URLs from text using regex. Updated to handle URLs within markdown, JSON, and plain text. Supports both extension-based URLs (.jpg, .png) and domain-based (Unsplash). Returns: tuple: (image_urls, audio_urls) """ # Pattern 1: Image URLs with file extensions # Matches URLs ending in image extensions, allowing most characters before the extension # Stops at whitespace or common delimiters like ), ], } image_pattern_ext = r'https?://[^\s)}\]]+?\.(?:jpg|jpeg|png|gif|webp|svg)(?:\?[^\s)}\]]*)?' # Pattern 2: Unsplash image URLs (no file extension needed) # Matches: https://images.unsplash.com/photo-XXXXXXX or similar image_pattern_unsplash = r'https?://images\.unsplash\.com/[^\s)}\]]*' # Pattern for audio URLs - handles both direct audio files AND xeno-canto links # Updated to be more permissive like image pattern audio_pattern_files = r'https?://[^\s)}\]]+?\.(?:mp3|wav|ogg|m4a)(?:\?[^\s)}\]]*)?' audio_pattern_xenocanto = r'https?://xeno-canto\.org/\d+(?:/download)?' print(f"[EXTRACT_URLS] Searching text of length {len(text)}") # Extract all URLs - combine both image patterns raw_image_urls_ext = re.findall(image_pattern_ext, text, re.IGNORECASE) raw_image_urls_unsplash = re.findall(image_pattern_unsplash, text, re.IGNORECASE) raw_audio_urls_files = re.findall(audio_pattern_files, text, re.IGNORECASE) audio_urls_xenocanto = list(set(re.findall(audio_pattern_xenocanto, text, re.IGNORECASE))) # Combine image URLs from both patterns raw_image_urls = raw_image_urls_ext + raw_image_urls_unsplash print(f"[EXTRACT_URLS] Found {len(raw_image_urls_ext)} extension-based image URLs") print(f"[EXTRACT_URLS] Found {len(raw_image_urls_unsplash)} Unsplash image URLs") print(f"[EXTRACT_URLS] Found {len(raw_audio_urls_files)} audio file URLs") print(f"[EXTRACT_URLS] Found {len(audio_urls_xenocanto)} xeno-canto URLs") # Clean URLs (remove trailing quotes, commas, etc.) def clean_url(url: str) -> str: cleaned = url.rstrip('",;)') # Validate it's still a proper URL if cleaned.startswith('http://') or cleaned.startswith('https://'): return cleaned else: print(f"[EXTRACT_URLS] ⚠️ Rejected malformed URL after cleaning: {cleaned}") return None image_urls = [u for u in (clean_url(url) for url in raw_image_urls) if u is not None] image_urls = list(set(image_urls)) # Deduplicate audio_urls_files = [u for u in (clean_url(url) for url in raw_audio_urls_files) if u is not None] audio_urls_files = list(set(audio_urls_files)) # Deduplicate # Combine both types of audio URLs audio_urls = audio_urls_files + audio_urls_xenocanto # Log the actual URLs extracted print(f"[EXTRACT_URLS] ✅ Cleaned image URLs ({len(image_urls)}): {image_urls}") print(f"[EXTRACT_URLS] ✅ Cleaned audio URLs ({len(audio_urls)}): {audio_urls}") return image_urls, audio_urls def extract_species_name(text: str) -> Optional[str]: """ Try to extract species name from common patterns in response. """ # Pattern: "identified as SPECIES NAME" or "species: SPECIES NAME" patterns = [ r'identified as[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})', r'species[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})', r'This is (?:a |an )?([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})', ] for pattern in patterns: match = re.search(pattern, text) if match: return match.group(1) return None async def parse_agent_response( raw_response: str, provider: str, api_key: str, model: str ) -> str: """ Parse agent response into structured format and reformat with guaranteed markdown. OPTIMIZED FOR HACKATHON: Uses regex extraction instead of LLM call. Still uses LlamaIndex Pydantic models for structured data. Args: raw_response: The agent's raw text response provider: LLM provider ("openai", "anthropic", "huggingface") api_key: API key (unused in optimized version) model: Model name (unused in optimized version) Returns: Formatted markdown response with guaranteed image syntax """ try: print("[STRUCTURED OUTPUT] Starting parsing...") print(f"[STRUCTURED OUTPUT] Raw response length: {len(raw_response)} characters") print(f"[STRUCTURED OUTPUT] First 500 chars: {raw_response[:500]}") print(f"[STRUCTURED OUTPUT] Last 500 chars: {raw_response[-500:]}") # Extract URLs using regex (fast, no API call) image_urls, audio_urls = extract_urls_from_text(raw_response) print(f"[STRUCTURED OUTPUT] Found {len(image_urls)} images, {len(audio_urls)} audio files") # Extract species name if possible species_name = extract_species_name(raw_response) # Create structured response using LlamaIndex Pydantic model structured = BirdIdentificationResponse( summary=raw_response, # Keep full response as summary species_name=species_name, image_urls=image_urls, audio_urls=audio_urls, confidence_score=None # Could extract with regex if needed ) # Check if we found any media to format if not structured.image_urls and not structured.audio_urls: print("[STRUCTURED OUTPUT] No images or audio found, returning original") return raw_response # Reformat into markdown with guaranteed images formatted_parts = [] # Main summary (but remove already-formatted images/audio to avoid duplication) clean_summary = raw_response for url in image_urls: # Remove existing markdown images clean_summary = re.sub(rf'!\[([^\]]*)\]\({re.escape(url)}\)', '', clean_summary) # Remove plain URLs clean_summary = clean_summary.replace(url, '') for url in audio_urls: # Remove audio URLs from summary clean_summary = clean_summary.replace(url, '') formatted_parts.append(clean_summary.strip()) # Add images with markdown syntax if structured.image_urls: formatted_parts.append("\n### Images\n") for idx, url in enumerate(structured.image_urls, 1): # Use species name if available, otherwise generic alt_text = structured.species_name or f"Bird {idx}" img_markdown = f"![{alt_text}]({url})" print(f"[STRUCTURED OUTPUT] Generated image markdown: {img_markdown}") formatted_parts.append(img_markdown) # Add audio links if present if structured.audio_urls: formatted_parts.append("\n### Audio Recordings\n") for idx, url in enumerate(structured.audio_urls, 1): # Strip /download from xeno-canto URLs for browser-friendly links display_url = url.replace("/download", "") if "xeno-canto.org" in url else url formatted_parts.append(f"🔊 [Listen to recording {idx}]({display_url})") result = "\n\n".join(formatted_parts) print(f"[STRUCTURED OUTPUT] ✅ Successfully formatted response") print(f"[STRUCTURED OUTPUT] Final markdown length: {len(result)} characters") print(f"[STRUCTURED OUTPUT] Final markdown (last 500 chars): {result[-500:]}") return result except Exception as e: # Fallback: return original response if parsing fails print(f"[STRUCTURED OUTPUT] ❌ Parsing failed: {e}") return raw_response