VibecoderMcSwaggins committed on
Commit
f985224
·
1 Parent(s): c2f7da2

fix: address CodeRabbit feedback and P0 blockers

Browse files

Code Fixes (HIGH priority):
- Add API key/provider validation to prevent silent auth failures
- Fix hardcoded manager model in orchestrator_magentic.py (now uses settings.openai_model)
- Add bounds checking to JSON extraction in judges.py (prevents IndexError)
- Fix fragile test assertion in test_judges_hf.py

Code Quality (MEDIUM priority):
- Add explicit type annotation for models_to_try: list[str]
- Fix structured logging (f-string → structured params)
- Align fallback query count (3 queries) between handlers

Test Improvements:
- Add @pytest.mark.unit decorator to TestHFInferenceJudgeHandler

Documentation Sync:
- Update Phase 3 docs to match actual implementation:
- __init__ signature (simplified, no inline imports)
- _extract_json (string split with bounds checking)
- _call_with_retry (tenacity decorator, asyncio.get_running_loop())
- assess method (simplified model loop)
- Update Phase 4 docs with ChatInterface additional_inputs for BYOK

All 104 tests pass.

docs/implementation/03_phase_judge.md CHANGED
@@ -374,272 +374,167 @@ class HFInferenceJudgeHandler:
374
  "HuggingFaceH4/zephyr-7b-beta", # Ungated fallback
375
  ]
376
 
377
- def __init__(self, model_id: str | None = None):
378
  """
379
  Initialize with HF Inference client.
380
 
381
  Args:
382
- model_id: HuggingFace model ID. If None, uses fallback chain.
383
- Will automatically use HF_TOKEN from env if available.
384
  """
385
- from huggingface_hub import InferenceClient
386
- import os
387
-
388
- self.model_id = model_id or self.FALLBACK_MODELS[0]
389
- self._fallback_models = self.FALLBACK_MODELS.copy()
390
-
391
- # InferenceClient auto-reads HF_TOKEN from env
392
- self.client = InferenceClient(model=self.model_id)
393
- self._has_token = bool(os.getenv("HF_TOKEN"))
394
-
395
  self.call_count = 0
396
- self.last_question = None
397
- self.last_evidence = None
398
-
399
- logger.info(
400
- "HFInferenceJudgeHandler initialized",
401
- model=self.model_id,
402
- has_token=self._has_token,
403
- )
404
 
405
- def _extract_json(self, response: str) -> dict | None:
406
  """
407
- Robustly extract JSON from LLM response.
408
-
409
- Handles:
410
- - Raw JSON: {"key": "value"}
411
- - Markdown code blocks: ```json\n{"key": "value"}\n```
412
- - Preamble text: "Here is the JSON:\n{"key": "value"}"
413
- - Nested braces: {"outer": {"inner": "value"}}
414
-
415
- Returns:
416
- Parsed dict or None if extraction fails
417
  """
418
- import json
419
- import re
420
-
421
- # Strategy 1: Try markdown code block first
422
- code_block_match = re.search(r"```(?:json)?\s*(\{[\s\S]*?\})\s*```", response)
423
- if code_block_match:
424
- try:
425
- return json.loads(code_block_match.group(1))
426
- except json.JSONDecodeError:
427
- pass
428
-
429
- # Strategy 2: Find outermost JSON object with brace matching
430
- # This handles nested objects correctly
431
- start = response.find("{")
432
- if start == -1:
 
 
 
 
433
  return None
434
 
435
- depth = 0
436
- end = start
437
  in_string = False
438
- escape_next = False
439
-
440
- for i, char in enumerate(response[start:], start):
441
- if escape_next:
442
- escape_next = False
443
- continue
444
-
445
- if char == "\\":
446
- escape_next = True
447
- continue
448
-
449
- if char == '"' and not escape_next:
450
- in_string = not in_string
451
- continue
452
 
 
453
  if in_string:
454
- continue
455
-
456
- if char == "{":
457
- depth += 1
 
 
 
 
 
 
458
  elif char == "}":
459
- depth -= 1
460
- if depth == 0:
461
- end = i + 1
462
- break
463
-
464
- if depth == 0 and end > start:
465
- try:
466
- return json.loads(response[start:end])
467
- except json.JSONDecodeError:
468
- pass
469
 
470
  return None
471
 
472
- async def _call_with_retry(
473
- self,
474
- messages: list[dict],
475
- max_retries: int = 3,
476
- ) -> str:
477
- """
478
- Call HF Inference with exponential backoff retry.
479
-
480
- Args:
481
- messages: Chat messages in OpenAI format
482
- max_retries: Max retry attempts
483
-
484
- Returns:
485
- Response text
486
-
487
- Raises:
488
- Exception if all retries fail
489
- """
490
- import asyncio
491
- import time
492
-
493
- last_error = None
494
 
495
- for attempt in range(max_retries):
496
- try:
497
- loop = asyncio.get_event_loop()
498
- response = await loop.run_in_executor(
499
- None,
500
- lambda: self.client.chat_completion(
501
- messages=messages,
502
- max_tokens=1024,
503
- temperature=0.1,
504
- )
505
- )
506
- return response.choices[0].message.content
507
 
508
- except Exception as e:
509
- last_error = e
510
- error_str = str(e).lower()
511
-
512
- # Check if rate limited or service unavailable
513
- is_rate_limit = "429" in error_str or "rate" in error_str
514
- is_unavailable = "503" in error_str or "unavailable" in error_str
515
- is_auth_error = "401" in error_str or "403" in error_str
516
-
517
- if is_auth_error:
518
- # Gated model without token - try fallback immediately
519
- logger.warning("Auth error, trying fallback model", error=str(e))
520
- if self._try_fallback_model():
521
- continue
522
- raise
523
-
524
- if is_rate_limit or is_unavailable:
525
- # Exponential backoff: 1s, 2s, 4s
526
- wait_time = 2 ** attempt
527
- logger.warning(
528
- "Rate limited, retrying",
529
- attempt=attempt + 1,
530
- wait=wait_time,
531
- error=str(e),
532
- )
533
- await asyncio.sleep(wait_time)
534
- continue
535
-
536
- # Other errors - raise immediately
537
- raise
538
-
539
- # All retries failed - try fallback model
540
- if self._try_fallback_model():
541
- return await self._call_with_retry(messages, max_retries=1)
542
-
543
- raise last_error or Exception("All retries failed")
544
-
545
- def _try_fallback_model(self) -> bool:
546
- """
547
- Try to switch to a fallback model.
548
 
549
- Returns:
550
- True if successfully switched, False if no fallbacks left
551
- """
552
- from huggingface_hub import InferenceClient
 
 
 
 
 
 
553
 
554
- # Remove current model from fallbacks
555
- if self.model_id in self._fallback_models:
556
- self._fallback_models.remove(self.model_id)
 
557
 
558
- if not self._fallback_models:
559
- return False
 
 
560
 
561
- # Switch to next model
562
- self.model_id = self._fallback_models[0]
563
- self.client = InferenceClient(model=self.model_id)
564
- logger.info("Switched to fallback model", model=self.model_id)
565
- return True
566
 
567
  async def assess(
568
  self,
569
  question: str,
570
- evidence: List[Evidence],
571
  ) -> JudgeAssessment:
572
  """
573
  Assess evidence using HuggingFace Inference API.
574
-
575
- Uses chat_completion API for model-agnostic prompts.
576
- Includes retry logic and fallback model chain.
577
-
578
- Args:
579
- question: The user's research question
580
- evidence: List of Evidence objects from search
581
-
582
- Returns:
583
- JudgeAssessment with evaluation results
584
  """
585
  self.call_count += 1
586
  self.last_question = question
587
  self.last_evidence = evidence
588
 
589
- # Format the prompt
590
  if evidence:
591
  user_prompt = format_user_prompt(question, evidence)
592
  else:
593
  user_prompt = format_empty_evidence_prompt(question)
594
 
595
- # Build messages in OpenAI-compatible format (works with chat_completion)
596
- json_schema = """{
597
- "details": {
598
- "mechanism_score": <int 0-10>,
599
- "mechanism_reasoning": "<string>",
600
- "clinical_evidence_score": <int 0-10>,
601
- "clinical_reasoning": "<string>",
602
- "drug_candidates": ["<string>", ...],
603
- "key_findings": ["<string>", ...]
604
- },
605
- "sufficient": <bool>,
606
- "confidence": <float 0-1>,
607
- "recommendation": "continue" | "synthesize",
608
- "next_search_queries": ["<string>", ...],
609
- "reasoning": "<string>"
610
- }"""
611
 
612
- messages = [
613
- {
614
- "role": "system",
615
- "content": f"{SYSTEM_PROMPT}\n\nIMPORTANT: Respond with ONLY valid JSON matching this schema:\n{json_schema}",
616
- },
617
- {
618
- "role": "user",
619
- "content": user_prompt,
620
- },
621
- ]
622
-
623
- try:
624
- # Call with retry and fallback
625
- response = await self._call_with_retry(messages)
626
-
627
- # Robust JSON extraction
628
- data = self._extract_json(response)
629
- if data:
630
- return JudgeAssessment(**data)
631
-
632
- # If no valid JSON, return fallback
633
- logger.warning(
634
- "HF Inference returned invalid JSON",
635
- response=response[:200],
636
- model=self.model_id,
637
- )
638
- return self._create_fallback_assessment(question, "Invalid JSON response")
639
 
640
- except Exception as e:
641
- logger.error("HF Inference failed", error=str(e), model=self.model_id)
642
- return self._create_fallback_assessment(question, str(e))
643
 
644
  def _create_fallback_assessment(
645
  self,
 
374
  "HuggingFaceH4/zephyr-7b-beta", # Ungated fallback
375
  ]
376
 
377
+ def __init__(self, model_id: str | None = None) -> None:
378
  """
379
  Initialize with HF Inference client.
380
 
381
  Args:
382
+ model_id: Optional specific model ID. If None, uses FALLBACK_MODELS chain.
 
383
  """
384
+ self.model_id = model_id
385
+ # Will automatically use HF_TOKEN from env if available
386
+ self.client = InferenceClient()
 
 
 
 
 
 
 
387
  self.call_count = 0
388
+ self.last_question: str | None = None
389
+ self.last_evidence: list[Evidence] | None = None
 
 
 
 
 
 
390
 
391
+ def _extract_json(self, text: str) -> dict[str, Any] | None:
392
  """
393
+ Robust JSON extraction that handles markdown blocks and nested braces.
 
 
 
 
 
 
 
 
 
394
  """
395
+ text = text.strip()
396
+
397
+ # Remove markdown code blocks if present (with bounds checking)
398
+ if "```json" in text:
399
+ parts = text.split("```json", 1)
400
+ if len(parts) > 1:
401
+ inner_parts = parts[1].split("```", 1)
402
+ text = inner_parts[0]
403
+ elif "```" in text:
404
+ parts = text.split("```", 1)
405
+ if len(parts) > 1:
406
+ inner_parts = parts[1].split("```", 1)
407
+ text = inner_parts[0]
408
+
409
+ text = text.strip()
410
+
411
+ # Find first '{'
412
+ start_idx = text.find("{")
413
+ if start_idx == -1:
414
  return None
415
 
416
+ # Stack-based parsing ignoring chars in strings
417
+ count = 0
418
  in_string = False
419
+ escape = False
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
421
+ for i, char in enumerate(text[start_idx:], start=start_idx):
422
  if in_string:
423
+ if escape:
424
+ escape = False
425
+ elif char == "\\":
426
+ escape = True
427
+ elif char == '"':
428
+ in_string = False
429
+ elif char == '"':
430
+ in_string = True
431
+ elif char == "{":
432
+ count += 1
433
  elif char == "}":
434
+ count -= 1
435
+ if count == 0:
436
+ try:
437
+ result = json.loads(text[start_idx : i + 1])
438
+ if isinstance(result, dict):
439
+ return result
440
+ return None
441
+ except json.JSONDecodeError:
442
+ return None
 
443
 
444
  return None
445
 
446
+ @retry(
447
+ stop=stop_after_attempt(3),
448
+ wait=wait_exponential(multiplier=1, min=1, max=4),
449
+ retry=retry_if_exception_type(Exception),
450
+ reraise=True,
451
+ )
452
+ async def _call_with_retry(self, model: str, prompt: str, question: str) -> JudgeAssessment:
453
+ """Make API call with retry logic using chat_completion."""
454
+ loop = asyncio.get_running_loop()
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
+ # Build messages for chat_completion (model-agnostic)
457
+ messages = [
458
+ {
459
+ "role": "system",
460
+ "content": f"""{SYSTEM_PROMPT}
 
 
 
 
 
 
 
461
 
462
+ IMPORTANT: Respond with ONLY valid JSON matching this schema:
463
+ {{
464
+ "details": {{
465
+ "mechanism_score": <int 0-10>,
466
+ "mechanism_reasoning": "<string>",
467
+ "clinical_evidence_score": <int 0-10>,
468
+ "clinical_reasoning": "<string>",
469
+ "drug_candidates": ["<string>", ...],
470
+ "key_findings": ["<string>", ...]
471
+ }},
472
+ "sufficient": <bool>,
473
+ "confidence": <float 0-1>,
474
+ "recommendation": "continue" | "synthesize",
475
+ "next_search_queries": ["<string>", ...],
476
+ "reasoning": "<string>"
477
+ }}""",
478
+ },
479
+ {"role": "user", "content": prompt},
480
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
 
482
+ # Use chat_completion (conversational task - supported by all models)
483
+ response = await loop.run_in_executor(
484
+ None,
485
+ lambda: self.client.chat_completion(
486
+ messages=messages,
487
+ model=model,
488
+ max_tokens=1024,
489
+ temperature=0.1,
490
+ ),
491
+ )
492
 
493
+ # Extract content from response
494
+ content = response.choices[0].message.content
495
+ if not content:
496
+ raise ValueError("Empty response from model")
497
 
498
+ # Extract and parse JSON
499
+ json_data = self._extract_json(content)
500
+ if not json_data:
501
+ raise ValueError("No valid JSON found in response")
502
 
503
+ return JudgeAssessment(**json_data)
 
 
 
 
504
 
505
  async def assess(
506
  self,
507
  question: str,
508
+ evidence: list[Evidence],
509
  ) -> JudgeAssessment:
510
  """
511
  Assess evidence using HuggingFace Inference API.
512
+ Attempts models in order until one succeeds.
 
 
 
 
 
 
 
 
 
513
  """
514
  self.call_count += 1
515
  self.last_question = question
516
  self.last_evidence = evidence
517
 
518
+ # Format the user prompt
519
  if evidence:
520
  user_prompt = format_user_prompt(question, evidence)
521
  else:
522
  user_prompt = format_empty_evidence_prompt(question)
523
 
524
+ models_to_try: list[str] = [self.model_id] if self.model_id else self.FALLBACK_MODELS
525
+ last_error: Exception | None = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
 
527
+ for model in models_to_try:
528
+ try:
529
+ return await self._call_with_retry(model, user_prompt, question)
530
+ except Exception as e:
531
+ logger.warning("Model failed", model=model, error=str(e))
532
+ last_error = e
533
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
 
535
+ # All models failed
536
+ logger.error("All HF models failed", error=str(last_error))
537
+ return self._create_fallback_assessment(question, str(last_error))
538
 
539
  def _create_fallback_assessment(
540
  self,
docs/implementation/04_phase_ui.md CHANGED
@@ -573,19 +573,43 @@ def create_demo() -> gr.Blocks:
573
  - "What existing medications show promise for Long COVID?"
574
  """)
575
 
576
- chatbot = gr.ChatInterface(
 
577
  fn=research_agent,
578
- type="messages",
579
- title="",
580
  examples=[
581
- "What drugs could be repurposed for Alzheimer's disease?",
582
- "Is metformin effective for treating cancer?",
583
- "What medications show promise for Long COVID treatment?",
584
- "Can statins be repurposed for neurological conditions?",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585
  ],
586
- retry_btn="🔄 Retry",
587
- undo_btn="↩️ Undo",
588
- clear_btn="🗑️ Clear",
589
  )
590
 
591
  gr.Markdown("""
 
573
  - "What existing medications show promise for Long COVID?"
574
  """)
575
 
576
+ # Note: additional_inputs render in an accordion below the chat input
577
+ gr.ChatInterface(
578
  fn=research_agent,
 
 
579
  examples=[
580
+ [
581
+ "What drugs could be repurposed for Alzheimer's disease?",
582
+ "simple",
583
+ "",
584
+ "openai",
585
+ ],
586
+ [
587
+ "Is metformin effective for treating cancer?",
588
+ "simple",
589
+ "",
590
+ "openai",
591
+ ],
592
+ ],
593
+ additional_inputs=[
594
+ gr.Radio(
595
+ choices=["simple", "magentic"],
596
+ value="simple",
597
+ label="Orchestrator Mode",
598
+ info="Simple: Linear | Magentic: Multi-Agent (OpenAI)",
599
+ ),
600
+ gr.Textbox(
601
+ label="API Key (Optional - Bring Your Own Key)",
602
+ placeholder="sk-... or sk-ant-...",
603
+ type="password",
604
+ info="Enter your own API key for full AI analysis. Never stored.",
605
+ ),
606
+ gr.Radio(
607
+ choices=["openai", "anthropic"],
608
+ value="openai",
609
+ label="API Provider",
610
+ info="Select the provider for your API key",
611
+ ),
612
  ],
 
 
 
613
  )
614
 
615
  gr.Markdown("""
src/agent_factory/judges.py CHANGED
@@ -195,14 +195,14 @@ class HFInferenceJudgeHandler:
195
  else:
196
  user_prompt = format_empty_evidence_prompt(question)
197
 
198
- models_to_try = [self.model_id] if self.model_id else self.FALLBACK_MODELS
199
- last_error = None
200
 
201
  for model in models_to_try:
202
  try:
203
  return await self._call_with_retry(model, user_prompt, question)
204
  except Exception as e:
205
- logger.warning(f"Model {model} failed", error=str(e))
206
  last_error = e
207
  continue
208
 
@@ -275,11 +275,17 @@ IMPORTANT: Respond with ONLY valid JSON matching this schema:
275
  """
276
  text = text.strip()
277
 
278
- # Remove markdown code blocks if present
279
  if "```json" in text:
280
- text = text.split("```json")[1].split("```")[0]
 
 
 
281
  elif "```" in text:
282
- text = text.split("```")[1].split("```")[0]
 
 
 
283
 
284
  text = text.strip()
285
 
@@ -339,6 +345,7 @@ IMPORTANT: Respond with ONLY valid JSON matching this schema:
339
  next_search_queries=[
340
  f"{question} mechanism",
341
  f"{question} clinical trials",
 
342
  ],
343
  reasoning=f"HF Inference failed: {error}. Recommend configuring OpenAI/Anthropic key.",
344
  )
 
195
  else:
196
  user_prompt = format_empty_evidence_prompt(question)
197
 
198
+ models_to_try: list[str] = [self.model_id] if self.model_id else self.FALLBACK_MODELS
199
+ last_error: Exception | None = None
200
 
201
  for model in models_to_try:
202
  try:
203
  return await self._call_with_retry(model, user_prompt, question)
204
  except Exception as e:
205
+ logger.warning("Model failed", model=model, error=str(e))
206
  last_error = e
207
  continue
208
 
 
275
  """
276
  text = text.strip()
277
 
278
+ # Remove markdown code blocks if present (with bounds checking)
279
  if "```json" in text:
280
+ parts = text.split("```json", 1)
281
+ if len(parts) > 1:
282
+ inner_parts = parts[1].split("```", 1)
283
+ text = inner_parts[0]
284
  elif "```" in text:
285
+ parts = text.split("```", 1)
286
+ if len(parts) > 1:
287
+ inner_parts = parts[1].split("```", 1)
288
+ text = inner_parts[0]
289
 
290
  text = text.strip()
291
 
 
345
  next_search_queries=[
346
  f"{question} mechanism",
347
  f"{question} clinical trials",
348
+ f"{question} drug candidates",
349
  ],
350
  reasoning=f"HF Inference failed: {error}. Recommend configuring OpenAI/Anthropic key.",
351
  )
src/app.py CHANGED
@@ -74,6 +74,14 @@ def configure_orchestrator(
74
  ):
75
  model: AnthropicModel | OpenAIModel | None = None
76
  if user_api_key:
 
 
 
 
 
 
 
 
77
  if api_provider == "anthropic":
78
  anthropic_provider = AnthropicProvider(api_key=user_api_key)
79
  model = AnthropicModel(settings.anthropic_model, provider=anthropic_provider)
 
74
  ):
75
  model: AnthropicModel | OpenAIModel | None = None
76
  if user_api_key:
77
+ # Validate key/provider match to prevent silent auth failures
78
+ if api_provider == "openai" and user_api_key.startswith("sk-ant-"):
79
+ raise ValueError("Anthropic key provided but OpenAI provider selected")
80
+ is_openai_key = user_api_key.startswith("sk-") and not user_api_key.startswith(
81
+ "sk-ant-"
82
+ )
83
+ if api_provider == "anthropic" and is_openai_key:
84
+ raise ValueError("OpenAI key provided but Anthropic provider selected")
85
  if api_provider == "anthropic":
86
  anthropic_provider = AnthropicProvider(api_key=user_api_key)
87
  model = AnthropicModel(settings.anthropic_model, provider=anthropic_provider)
src/orchestrator_magentic.py CHANGED
@@ -82,7 +82,7 @@ class MagenticOrchestrator:
82
 
83
  # Manager chat client (orchestrates the agents)
84
  manager_client = OpenAIChatClient(
85
- model_id="gpt-4o", # Good model for planning/coordination
86
  api_key=settings.openai_api_key,
87
  )
88
 
 
82
 
83
  # Manager chat client (orchestrates the agents)
84
  manager_client = OpenAIChatClient(
85
+ model_id=settings.openai_model, # Use configured model
86
  api_key=settings.openai_api_key,
87
  )
88
 
tests/unit/agent_factory/test_judges_hf.py CHANGED
@@ -8,6 +8,7 @@ from src.agent_factory.judges import HFInferenceJudgeHandler
8
  from src.utils.models import Citation, Evidence
9
 
10
 
 
11
  class TestHFInferenceJudgeHandler:
12
  """Tests for HFInferenceJudgeHandler."""
13
 
@@ -102,9 +103,9 @@ class TestHFInferenceJudgeHandler:
102
 
103
  # Should have tried all 3 fallback models
104
  assert mock_call.call_count == 3
105
- assert result.sufficient is False # Fallback assessment
106
- error_msg = "All HF models failed"
107
- assert error_msg in str(mock_call.side_effect) or "failed" in result.reasoning
108
 
109
  def test_extract_json_robustness(self, handler):
110
  """Test JSON extraction with various inputs."""
 
8
  from src.utils.models import Citation, Evidence
9
 
10
 
11
+ @pytest.mark.unit
12
  class TestHFInferenceJudgeHandler:
13
  """Tests for HFInferenceJudgeHandler."""
14
 
 
103
 
104
  # Should have tried all 3 fallback models
105
  assert mock_call.call_count == 3
106
+ # Fallback assessment should indicate failure
107
+ assert result.sufficient is False
108
+ assert "failed" in result.reasoning.lower() or "error" in result.reasoning.lower()
109
 
110
  def test_extract_json_robustness(self, handler):
111
  """Test JSON extraction with various inputs."""