VibecoderMcSwaggins committed
Commit ec3d7dc · 1 Parent(s): 2dc022a

docs: comprehensive Phase 6-8 spec revisions (Senior Architect audit)


Phase 6 (Embeddings):
- Add async/executor pattern for all embedding methods
- Add HuggingFace Spaces deployment notes for model caching
- Update tests to be async

Phase 7 (Hypothesis):
- Add text utilities: truncate_at_sentence(), select_diverse_evidence()
- Replace arbitrary evidence[:10] with MMR (Maximal Marginal Relevance)
- Sentence-boundary-aware truncation

Phase 8 (Report) - CRITICAL:
- Add citation validation to prevent LLM hallucination
- Add validate_references() function spec
- Update SYSTEM_PROMPT with strict citation requirements
- Add comprehensive citation validation tests

All changes address issues identified in the "Senior Architect" audit:
- Event loop blocking from sync sentence-transformers
- Arbitrary truncation losing important evidence
- Hallucinated citations in medical research reports

docs/implementation/06_phase_embeddings.md CHANGED
````diff
@@ -82,14 +82,33 @@ embeddings = [
 
 ### 4.2 Embedding Service (`src/services/embeddings.py`)
 
+> **CRITICAL: Async Pattern Required**
+>
+> `sentence-transformers` is synchronous and CPU-bound. Running it directly in async code
+> will **block the event loop**, freezing the UI and halting all concurrent operations.
+>
+> **Solution**: Use `asyncio.run_in_executor()` to offload to thread pool.
+> This pattern already exists in `src/tools/websearch.py:28-34`.
+
 ```python
-"""Embedding service for semantic search."""
+"""Embedding service for semantic search.
+
+IMPORTANT: All public methods are async to avoid blocking the event loop.
+The sentence-transformers model is CPU-bound, so we use run_in_executor().
+"""
+import asyncio
 from typing import List
+
 import chromadb
 from sentence_transformers import SentenceTransformer
 
+
 class EmbeddingService:
-    """Handles text embedding and vector storage."""
+    """Handles text embedding and vector storage.
+
+    All embedding operations run in a thread pool to avoid blocking
+    the async event loop. See src/tools/websearch.py for the pattern.
+    """
 
     def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
         self._model = SentenceTransformer(model_name)
@@ -99,27 +118,67 @@ class EmbeddingService:
             metadata={"hnsw:space": "cosine"}
         )
 
-    def embed(self, text: str) -> List[float]:
-        """Embed a single text."""
+    # ─────────────────────────────────────────────────────────────────
+    # Sync internal methods (run in thread pool)
+    # ─────────────────────────────────────────────────────────────────
+
+    def _sync_embed(self, text: str) -> List[float]:
+        """Synchronous embedding - DO NOT call directly from async code."""
         return self._model.encode(text).tolist()
 
-    def add_evidence(self, evidence_id: str, content: str, metadata: dict) -> None:
-        """Add evidence to vector store."""
-        embedding = self.embed(content)
-        self._collection.add(
-            ids=[evidence_id],
-            embeddings=[embedding],
-            metadatas=[metadata],
-            documents=[content]
+    def _sync_batch_embed(self, texts: List[str]) -> List[List[float]]:
+        """Batch embedding for efficiency - DO NOT call directly from async code."""
+        return [e.tolist() for e in self._model.encode(texts)]
+
+    # ─────────────────────────────────────────────────────────────────
+    # Async public methods (safe for event loop)
+    # ─────────────────────────────────────────────────────────────────
+
+    async def embed(self, text: str) -> List[float]:
+        """Embed a single text (async-safe).
+
+        Uses run_in_executor to avoid blocking the event loop.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self._sync_embed, text)
+
+    async def embed_batch(self, texts: List[str]) -> List[List[float]]:
+        """Batch embed multiple texts (async-safe, more efficient)."""
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self._sync_batch_embed, texts)
+
+    async def add_evidence(self, evidence_id: str, content: str, metadata: dict) -> None:
+        """Add evidence to vector store (async-safe)."""
+        embedding = await self.embed(content)
+        # ChromaDB operations are fast, but wrap for consistency
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(
+            None,
+            lambda: self._collection.add(
+                ids=[evidence_id],
+                embeddings=[embedding],
+                metadatas=[metadata],
+                documents=[content]
+            )
         )
 
-    def search_similar(self, query: str, n_results: int = 5) -> List[dict]:
-        """Find semantically similar evidence."""
-        query_embedding = self.embed(query)
-        results = self._collection.query(
-            query_embeddings=[query_embedding],
-            n_results=n_results
+    async def search_similar(self, query: str, n_results: int = 5) -> List[dict]:
+        """Find semantically similar evidence (async-safe)."""
+        query_embedding = await self.embed(query)
+
+        loop = asyncio.get_running_loop()
+        results = await loop.run_in_executor(
+            None,
+            lambda: self._collection.query(
+                query_embeddings=[query_embedding],
+                n_results=n_results
+            )
         )
+
+        # Handle empty results gracefully
+        if not results["ids"] or not results["ids"][0]:
+            return []
+
         return [
             {"id": id, "content": doc, "metadata": meta, "distance": dist}
             for id, doc, meta, dist in zip(
@@ -130,14 +189,14 @@ class EmbeddingService:
             )
         ]
 
-    def deduplicate(self, new_evidence: List, threshold: float = 0.9) -> List:
-        """Remove semantically duplicate evidence."""
+    async def deduplicate(self, new_evidence: List, threshold: float = 0.9) -> List:
+        """Remove semantically duplicate evidence (async-safe)."""
         unique = []
         for evidence in new_evidence:
-            similar = self.search_similar(evidence.content, n_results=1)
+            similar = await self.search_similar(evidence.content, n_results=1)
             if not similar or similar[0]["distance"] > (1 - threshold):
                 unique.append(evidence)
-                self.add_evidence(
+                await self.add_evidence(
                     evidence_id=evidence.citation.url,
                     content=evidence.content,
                     metadata={"source": evidence.citation.source}
@@ -147,7 +206,7 @@ class EmbeddingService:
 
 ### 4.3 Enhanced SearchAgent (`src/agents/search_agent.py`)
 
-Update SearchAgent to use embeddings:
+Update SearchAgent to use embeddings. **Note**: All embedding calls are `await`ed:
 
 ```python
 class SearchAgent(BaseAgent):
@@ -166,14 +225,20 @@ class SearchAgent(BaseAgent):
         # Execute keyword search
         result = await self._handler.execute(query, max_results_per_tool=10)
 
-        # Semantic deduplication (NEW)
+        # Semantic deduplication (NEW) - ALL CALLS ARE AWAITED
         if self._embeddings:
-            unique_evidence = self._embeddings.deduplicate(result.evidence)
+            # Deduplicate by semantic similarity (async-safe)
+            unique_evidence = await self._embeddings.deduplicate(result.evidence)
 
-            # Also search for semantically related evidence
-            related = self._embeddings.search_similar(query, n_results=5)
-            # Add related evidence not already in results
-            # ... merge logic ...
+            # Also search for semantically related evidence (async-safe)
+            related = await self._embeddings.search_similar(query, n_results=5)
+
+            # Merge related evidence not already in results
+            existing_urls = {e.citation.url for e in unique_evidence}
+            for item in related:
+                if item["id"] not in existing_urls:
+                    # Reconstruct Evidence from stored data
+                    # ... merge logic ...
 
         # ... rest of method ...
 ```
@@ -193,6 +258,40 @@ The system has semantic search enabled. When evidence is found:
 """
 ```
 
+### 4.5 HuggingFace Spaces Deployment
+
+> **⚠️ Important for HF Spaces**
+>
+> `sentence-transformers` downloads models (~500MB) to `~/.cache` on first use.
+> HuggingFace Spaces have **ephemeral storage** - the cache is wiped on restart.
+> This causes slow cold starts and bandwidth usage.
+
+**Solution**: Pre-download the model in your Dockerfile:
+
+```dockerfile
+# In Dockerfile
+FROM python:3.11-slim
+
+# Set cache directory
+ENV HF_HOME=/app/.cache
+ENV TRANSFORMERS_CACHE=/app/.cache
+
+# Pre-download the embedding model during build
+RUN pip install sentence-transformers && \
+    python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
+
+# ... rest of Dockerfile
+```
+
+**Alternative**: Use an environment variable to specify a persistent path:
+
+```yaml
+# In HF Spaces settings or app.yaml
+env:
+  - name: HF_HOME
+    value: /data/.cache  # Persistent volume
+```
+
 ---
 
 ## 5. Directory Structure After Phase 6
@@ -214,26 +313,31 @@ src/
 
 ### 6.1 Unit Tests (`tests/unit/services/test_embeddings.py`)
 
+> **Note**: All tests are async since the EmbeddingService methods are async.
+
 ```python
 """Unit tests for EmbeddingService."""
 import pytest
 from src.services.embeddings import EmbeddingService
 
+
 class TestEmbeddingService:
-    def test_embed_returns_vector(self):
+    @pytest.mark.asyncio
+    async def test_embed_returns_vector(self):
         """Embedding should return a float vector."""
         service = EmbeddingService()
-        embedding = service.embed("metformin diabetes")
+        embedding = await service.embed("metformin diabetes")
         assert isinstance(embedding, list)
         assert len(embedding) > 0
         assert all(isinstance(x, float) for x in embedding)
 
-    def test_similar_texts_have_close_embeddings(self):
+    @pytest.mark.asyncio
+    async def test_similar_texts_have_close_embeddings(self):
         """Semantically similar texts should have similar embeddings."""
         service = EmbeddingService()
-        e1 = service.embed("metformin treats diabetes")
-        e2 = service.embed("metformin is used for diabetes treatment")
-        e3 = service.embed("the weather is sunny today")
+        e1 = await service.embed("metformin treats diabetes")
+        e2 = await service.embed("metformin is used for diabetes treatment")
+        e3 = await service.embed("the weather is sunny today")
 
         # Cosine similarity helper
         from numpy import dot
@@ -243,18 +347,37 @@ class TestEmbeddingService:
         # Similar texts should be closer
         assert cosine(e1, e2) > cosine(e1, e3)
 
-    def test_add_and_search(self):
+    @pytest.mark.asyncio
+    async def test_batch_embed_efficient(self):
+        """Batch embedding should be more efficient than individual calls."""
+        service = EmbeddingService()
+        texts = ["text one", "text two", "text three"]
+
+        # Batch embed
+        batch_results = await service.embed_batch(texts)
+        assert len(batch_results) == 3
+        assert all(isinstance(e, list) for e in batch_results)
+
+    @pytest.mark.asyncio
+    async def test_add_and_search(self):
         """Should be able to add evidence and search for similar."""
         service = EmbeddingService()
-        service.add_evidence(
+        await service.add_evidence(
             evidence_id="test1",
             content="Metformin activates AMPK pathway",
            metadata={"source": "pubmed"}
         )
 
-        results = service.search_similar("AMPK activation drugs", n_results=1)
+        results = await service.search_similar("AMPK activation drugs", n_results=1)
         assert len(results) == 1
         assert "AMPK" in results[0]["content"]
+
+    @pytest.mark.asyncio
+    async def test_search_similar_empty_collection(self):
+        """Search on empty collection should return empty list, not error."""
+        service = EmbeddingService()
+        results = await service.search_similar("anything", n_results=5)
+        assert results == []
 ```
 
 ---
````
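
To sanity-check the Phase 6 executor pattern outside the app, here is a minimal sketch, assuming only `sentence-transformers` is installed. The model name comes from the spec above; the free-standing `embed` helper and `main` driver are illustrative, not spec code:

```python
import asyncio
from sentence_transformers import SentenceTransformer

# Model name from the spec; downloaded on first run.
_model = SentenceTransformer("all-MiniLM-L6-v2")

def _sync_embed(text: str) -> list[float]:
    # CPU-bound encode stays synchronous.
    return _model.encode(text).tolist()

async def embed(text: str) -> list[float]:
    # Offload to the default thread pool so the event loop stays responsive.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _sync_embed, text)

async def main() -> None:
    vec = await embed("metformin activates AMPK")
    print(len(vec))  # all-MiniLM-L6-v2 produces 384-dimensional vectors

asyncio.run(main())
```

The `run_in_executor(None, fn, *args)` shape is the same one `EmbeddingService.embed` and `embed_batch` use; only the wrapper moves into a class.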
docs/implementation/07_phase_hypothesis.md CHANGED
````diff
@@ -116,10 +116,150 @@ class HypothesisAssessment(BaseModel):
 
 ## 4. Implementation
 
+### 4.0 Text Utilities (`src/utils/text_utils.py`)
+
+> **Why These Utilities?**
+>
+> The original spec used arbitrary truncation (`evidence[:10]` and `content[:300]`).
+> This loses important information randomly. These utilities provide:
+> 1. **Sentence-aware truncation** - cuts at sentence boundaries, not mid-word
+> 2. **Diverse evidence selection** - uses embeddings to select varied evidence (MMR)
+
+```python
+"""Text processing utilities for evidence handling."""
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from src.services.embeddings import EmbeddingService
+    from src.utils.models import Evidence
+
+
+def truncate_at_sentence(text: str, max_chars: int = 300) -> str:
+    """Truncate text at sentence boundary, preserving meaning.
+
+    Args:
+        text: The text to truncate
+        max_chars: Maximum characters (default 300)
+
+    Returns:
+        Text truncated at last complete sentence within limit
+    """
+    if len(text) <= max_chars:
+        return text
+
+    # Find truncation point
+    truncated = text[:max_chars]
+
+    # Look for sentence endings: . ! ? followed by space or end
+    for sep in ['. ', '! ', '? ', '.\n', '!\n', '?\n']:
+        last_sep = truncated.rfind(sep)
+        if last_sep > max_chars // 2:  # Don't truncate too aggressively
+            return text[:last_sep + 1].strip()
+
+    # Fallback: find last period
+    last_period = truncated.rfind('.')
+    if last_period > max_chars // 2:
+        return text[:last_period + 1].strip()
+
+    # Last resort: truncate at word boundary
+    last_space = truncated.rfind(' ')
+    if last_space > 0:
+        return text[:last_space].strip() + "..."
+
+    return truncated + "..."
+
+
+async def select_diverse_evidence(
+    evidence: list["Evidence"],
+    n: int,
+    query: str,
+    embeddings: "EmbeddingService | None" = None
+) -> list["Evidence"]:
+    """Select n most diverse and relevant evidence items.
+
+    Uses Maximal Marginal Relevance (MMR) when embeddings available,
+    falls back to relevance_score sorting otherwise.
+
+    Args:
+        evidence: All available evidence
+        n: Number of items to select
+        query: Original query for relevance scoring
+        embeddings: Optional EmbeddingService for semantic diversity
+
+    Returns:
+        Selected evidence items, diverse and relevant
+    """
+    if not evidence:
+        return []
+
+    if n >= len(evidence):
+        return evidence
+
+    # Fallback: sort by relevance score if no embeddings
+    if embeddings is None:
+        return sorted(
+            evidence,
+            key=lambda e: e.relevance_score,
+            reverse=True
+        )[:n]
+
+    # MMR: Maximal Marginal Relevance for diverse selection
+    # Score = λ * relevance - (1-λ) * max_similarity_to_selected
+    lambda_param = 0.7  # Balance relevance vs diversity
+
+    # Get query embedding
+    query_emb = await embeddings.embed(query)
+
+    # Get all evidence embeddings
+    evidence_embs = await embeddings.embed_batch([e.content for e in evidence])
+
+    # Compute relevance scores (cosine similarity to query)
+    from numpy import dot
+    from numpy.linalg import norm
+    cosine = lambda a, b: float(dot(a, b) / (norm(a) * norm(b)))
+
+    relevance_scores = [cosine(query_emb, emb) for emb in evidence_embs]
+
+    # Greedy MMR selection
+    selected_indices: list[int] = []
+    remaining = set(range(len(evidence)))
+
+    for _ in range(n):
+        best_score = float('-inf')
+        best_idx = -1
+
+        for idx in remaining:
+            # Relevance component
+            relevance = relevance_scores[idx]
+
+            # Diversity component: max similarity to already selected
+            if selected_indices:
+                max_sim = max(
+                    cosine(evidence_embs[idx], evidence_embs[sel])
+                    for sel in selected_indices
+                )
+            else:
+                max_sim = 0
+
+            # MMR score
+            mmr_score = lambda_param * relevance - (1 - lambda_param) * max_sim
+
+            if mmr_score > best_score:
+                best_score = mmr_score
+                best_idx = idx
+
+        if best_idx >= 0:
+            selected_indices.append(best_idx)
+            remaining.remove(best_idx)
+
+    return [evidence[i] for i in selected_indices]
+```
+
 ### 4.1 Hypothesis Prompts (`src/prompts/hypothesis.py`)
 
 ```python
 """Prompts for Hypothesis Agent."""
+from src.utils.text_utils import truncate_at_sentence, select_diverse_evidence
 
 SYSTEM_PROMPT = """You are a biomedical research scientist specializing in drug repurposing.
 
@@ -141,16 +281,35 @@ Example hypothesis format:
 
 Be specific. Use actual gene/protein names when possible."""
 
-def format_hypothesis_prompt(query: str, evidence: list) -> str:
-    """Format prompt for hypothesis generation."""
+
+async def format_hypothesis_prompt(
+    query: str,
+    evidence: list,
+    embeddings=None
+) -> str:
+    """Format prompt for hypothesis generation.
+
+    Uses smart evidence selection instead of arbitrary truncation.
+
+    Args:
+        query: The research query
+        evidence: All collected evidence
+        embeddings: Optional EmbeddingService for diverse selection
+    """
+    # Select diverse, relevant evidence (not arbitrary first 10)
+    selected = await select_diverse_evidence(
+        evidence, n=10, query=query, embeddings=embeddings
+    )
+
+    # Format with sentence-aware truncation
     evidence_text = "\n".join([
-        f"- {e.citation.title}: {e.content[:300]}..."
-        for e in evidence[:10]
+        f"- **{e.citation.title}** ({e.citation.source}): {truncate_at_sentence(e.content, 300)}"
+        for e in selected
     ])
 
     return f"""Based on the following evidence about "{query}", generate mechanistic hypotheses.
 
-## Evidence
+## Evidence ({len(selected)} papers selected for diversity)
 {evidence_text}
 
 ## Task
@@ -167,7 +326,7 @@ Generate 2-4 hypotheses, prioritized by confidence."""
 ```python
 """Hypothesis agent for mechanistic reasoning."""
 from collections.abc import AsyncIterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from agent_framework import (
     AgentRunResponse,
@@ -183,6 +342,9 @@ from src.prompts.hypothesis import SYSTEM_PROMPT, format_hypothesis_prompt
 from src.utils.config import settings
 from src.utils.models import Evidence, HypothesisAssessment
 
+if TYPE_CHECKING:
+    from src.services.embeddings import EmbeddingService
+
 
 class HypothesisAgent(BaseAgent):
     """Generates mechanistic hypotheses based on evidence."""
@@ -190,12 +352,14 @@ class HypothesisAgent(BaseAgent):
     def __init__(
         self,
         evidence_store: dict[str, list[Evidence]],
+        embedding_service: "EmbeddingService | None" = None,  # NEW: for diverse selection
     ) -> None:
         super().__init__(
             name="HypothesisAgent",
             description="Generates scientific hypotheses about drug mechanisms to guide research",
         )
         self._evidence_store = evidence_store
+        self._embeddings = embedding_service  # Used for MMR evidence selection
         self._agent = Agent(
             model=settings.llm_provider,  # Uses configured LLM
             output_type=HypothesisAssessment,
@@ -225,8 +389,11 @@ class HypothesisAgent(BaseAgent):
                 response_id="hypothesis-no-evidence",
             )
 
-        # Generate hypotheses
-        prompt = format_hypothesis_prompt(query, evidence)
+        # Generate hypotheses with diverse evidence selection
+        # NOTE: format_hypothesis_prompt is now async
+        prompt = await format_hypothesis_prompt(
+            query, evidence, embeddings=self._embeddings
+        )
         result = await self._agent.run(prompt)
         assessment = result.output
 
````
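
The MMR rule in `select_diverse_evidence` (score = lam * relevance - (1 - lam) * max similarity to items already picked) can be exercised without the embedding service. A standalone sketch with toy 3-D vectors; `mmr_select` and the vectors are illustrative stand-ins, not spec code:

```python
from numpy import dot
from numpy.linalg import norm

def cosine(a, b) -> float:
    return float(dot(a, b) / (norm(a) * norm(b)))

def mmr_select(query, cands, n, lam=0.7):
    # Greedy MMR: score = lam * relevance - (1 - lam) * max sim to picks.
    relevance = [cosine(query, c) for c in cands]
    picked: list[int] = []
    remaining = set(range(len(cands)))
    for _ in range(min(n, len(cands))):
        def score(i):
            max_sim = max((cosine(cands[i], cands[j]) for j in picked), default=0.0)
            return lam * relevance[i] - (1 - lam) * max_sim
        best = max(remaining, key=score)
        picked.append(best)
        remaining.remove(best)
    return picked

# cands[1] is a near-duplicate of cands[0]; cands[2] is less relevant but distinct.
query = [1.0, 0.0, 0.0]
cands = [[0.8, 0.6, 0.0], [0.79, 0.6, 0.05], [0.75, 0.0, 0.66]]
print(mmr_select(query, cands, n=2))  # [0, 2] with these toy vectors
```

With these values the near-duplicate is passed over in favor of the distinct item, which is exactly the behavior the spec wants from evidence selection at lambda = 0.7.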
docs/implementation/08_phase_report.md CHANGED
````diff
@@ -190,10 +190,100 @@ class ResearchReport(BaseModel):
 
 ## 4. Implementation
 
+### 4.0 Citation Validation (`src/utils/citation_validator.py`)
+
+> **🚨 CRITICAL: Why Citation Validation?**
+>
+> LLMs frequently **hallucinate** citations - inventing paper titles, authors, and URLs
+> that don't exist. For a medical research tool, fake citations are **dangerous**.
+>
+> This validation layer ensures every reference in the report actually exists
+> in the collected evidence.
+
+```python
+"""Citation validation to prevent LLM hallucination.
+
+CRITICAL: Medical research requires accurate citations.
+This module validates that all references exist in collected evidence.
+"""
+import logging
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from src.utils.models import Evidence, ResearchReport
+
+logger = logging.getLogger(__name__)
+
+
+def validate_references(
+    report: "ResearchReport",
+    evidence: list["Evidence"]
+) -> "ResearchReport":
+    """Ensure all references actually exist in collected evidence.
+
+    CRITICAL: Prevents LLM hallucination of citations.
+
+    Args:
+        report: The generated research report
+        evidence: All evidence collected during research
+
+    Returns:
+        Report with only valid references (hallucinated ones removed)
+    """
+    # Build set of valid URLs from evidence
+    valid_urls = {e.citation.url for e in evidence}
+    valid_titles = {e.citation.title.lower() for e in evidence}
+
+    validated_refs = []
+    removed_count = 0
+
+    for ref in report.references:
+        ref_url = ref.get("url", "")
+        ref_title = ref.get("title", "").lower()
+
+        # Check if URL matches collected evidence
+        if ref_url in valid_urls:
+            validated_refs.append(ref)
+        # Fallback: check title match (URLs might differ slightly)
+        elif ref_title and any(ref_title in t or t in ref_title for t in valid_titles):
+            validated_refs.append(ref)
+        else:
+            removed_count += 1
+            logger.warning(
+                f"Removed hallucinated reference: '{ref.get('title', 'Unknown')}' "
+                f"(URL: {ref_url[:50]}...)"
+            )
+
+    if removed_count > 0:
+        logger.info(
+            f"Citation validation removed {removed_count} hallucinated references. "
+            f"{len(validated_refs)} valid references remain."
+        )
+
+    # Update report with validated references
+    report.references = validated_refs
+    return report
+
+
+def build_reference_from_evidence(evidence: "Evidence") -> dict:
+    """Build a properly formatted reference from evidence.
+
+    Use this to ensure references match the original evidence exactly.
+    """
+    return {
+        "title": evidence.citation.title,
+        "authors": evidence.citation.authors or ["Unknown"],
+        "source": evidence.citation.source,
+        "date": evidence.citation.date or "n.d.",
+        "url": evidence.citation.url,
+    }
+```
+
 ### 4.1 Report Prompts (`src/prompts/report.py`)
 
 ```python
 """Prompts for Report Agent."""
+from src.utils.text_utils import truncate_at_sentence, select_diverse_evidence
 
 SYSTEM_PROMPT = """You are a scientific writer specializing in drug repurposing research reports.
 
@@ -210,34 +300,66 @@ A good report:
 8. Provides a balanced CONCLUSION
 9. Includes properly formatted REFERENCES
 
-Write in scientific but accessible language. Be specific about evidence strength."""
+Write in scientific but accessible language. Be specific about evidence strength.
+
+─────────────────────────────────────────────────────────────────────────────
+🚨 CRITICAL CITATION REQUIREMENTS 🚨
+─────────────────────────────────────────────────────────────────────────────
+
+You MUST follow these rules for the References section:
+
+1. You may ONLY cite papers that appear in the Evidence section above
+2. Every reference URL must EXACTLY match a provided evidence URL
+3. Do NOT invent, fabricate, or hallucinate any references
+4. Do NOT modify paper titles, authors, dates, or URLs
+5. If unsure about a citation, OMIT it rather than guess
+6. Copy URLs exactly as provided - do not create similar-looking URLs
 
+VIOLATION OF THESE RULES PRODUCES DANGEROUS MISINFORMATION.
+─────────────────────────────────────────────────────────────────────────────"""
 
-def format_report_prompt(
+
+async def format_report_prompt(
     query: str,
     evidence: list,
     hypotheses: list,
     assessment: dict,
-    metadata: dict
+    metadata: dict,
+    embeddings=None
 ) -> str:
-    """Format prompt for report generation."""
+    """Format prompt for report generation.
+
+    Includes full evidence details for accurate citation.
+    """
+    # Select diverse evidence (not arbitrary truncation)
+    selected = await select_diverse_evidence(
+        evidence, n=20, query=query, embeddings=embeddings
+    )
 
+    # Include FULL citation details for each evidence item
+    # This helps the LLM create accurate references
     evidence_summary = "\n".join([
-        f"- [{e.citation.title}]({e.citation.url}): {e.content[:200]}..."
-        for e in evidence[:15]
+        f"- **Title**: {e.citation.title}\n"
+        f"  **URL**: {e.citation.url}\n"
+        f"  **Authors**: {', '.join(e.citation.authors or ['Unknown'])}\n"
+        f"  **Date**: {e.citation.date or 'n.d.'}\n"
+        f"  **Source**: {e.citation.source}\n"
+        f"  **Content**: {truncate_at_sentence(e.content, 200)}\n"
+        for e in selected
     ])
 
     hypotheses_summary = "\n".join([
         f"- {h.drug} → {h.target} → {h.pathway} → {h.effect} (Confidence: {h.confidence:.0%})"
         for h in hypotheses
-    ])
+    ]) if hypotheses else "No hypotheses generated yet."
 
     return f"""Generate a structured research report for the following query.
 
 ## Original Query
 {query}
 
-## Evidence Collected ({len(evidence)} papers)
+## Evidence Collected ({len(selected)} papers, selected for diversity)
+
 {evidence_summary}
 
 ## Hypotheses Generated
@@ -252,7 +374,9 @@ def format_report_prompt(
 - Sources Searched: {', '.join(metadata.get('sources', []))}
 - Search Iterations: {metadata.get('iterations', 0)}
 
-Generate a complete ResearchReport with all sections filled in."""
+Generate a complete ResearchReport with all sections filled in.
+
+REMINDER: Only cite papers from the Evidence section above. Copy URLs exactly."""
 ```
 
 ### 4.2 Report Agent (`src/agents/report_agent.py`)
@@ -260,7 +384,7 @@ Generate a complete ResearchReport with all sections filled in."""
 ```python
 """Report agent for generating structured research reports."""
 from collections.abc import AsyncIterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from agent_framework import (
     AgentRunResponse,
@@ -273,9 +397,13 @@ from agent_framework import (
 from pydantic_ai import Agent
 
 from src.prompts.report import SYSTEM_PROMPT, format_report_prompt
+from src.utils.citation_validator import validate_references  # CRITICAL
 from src.utils.config import settings
 from src.utils.models import Evidence, MechanismHypothesis, ResearchReport
 
+if TYPE_CHECKING:
+    from src.services.embeddings import EmbeddingService
+
 
 class ReportAgent(BaseAgent):
     """Generates structured scientific reports from evidence and hypotheses."""
@@ -283,12 +411,14 @@ class ReportAgent(BaseAgent):
     def __init__(
         self,
         evidence_store: dict[str, list[Evidence]],
+        embedding_service: "EmbeddingService | None" = None,  # For diverse selection
     ) -> None:
         super().__init__(
             name="ReportAgent",
             description="Generates structured scientific research reports with citations",
         )
         self._evidence_store = evidence_store
+        self._embeddings = embedding_service
         self._agent = Agent(
             model=settings.llm_provider,
             output_type=ResearchReport,
@@ -325,19 +455,25 @@ class ReportAgent(BaseAgent):
             "iterations": self._evidence_store.get("iteration_count", 0),
         }
 
-        # Generate report
-        prompt = format_report_prompt(
+        # Generate report (format_report_prompt is now async)
+        prompt = await format_report_prompt(
             query=query,
             evidence=evidence,
             hypotheses=hypotheses,
            assessment=assessment,
-            metadata=metadata
+            metadata=metadata,
+            embeddings=self._embeddings,
         )
 
         result = await self._agent.run(prompt)
         report = result.output
 
-        # Store report
+        # ═══════════════════════════════════════════════════════════════════
+        # 🚨 CRITICAL: Validate citations to prevent hallucination
+        # ═══════════════════════════════════════════════════════════════════
+        report = validate_references(report, evidence)
+
+        # Store validated report
         self._evidence_store["final_report"] = report
 
         # Return markdown version
@@ -553,6 +689,94 @@ async def test_report_agent_no_evidence():
     response = await agent.run("test query")
 
     assert "Cannot generate report" in response.messages[0].text
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 🚨 CRITICAL: Citation Validation Tests
+# ═══════════════════════════════════════════════════════════════════════════
+
+@pytest.mark.asyncio
+async def test_report_agent_removes_hallucinated_citations(sample_evidence):
+    """ReportAgent should remove citations not in evidence."""
+    from src.utils.citation_validator import validate_references
+
+    # Create report with mix of valid and hallucinated references
+    report_with_hallucinations = ResearchReport(
+        title="Test Report",
+        executive_summary="This is a test report for citation validation...",
+        research_question="Testing citation validation",
+        methodology=ReportSection(title="Methodology", content="Test"),
+        hypotheses_tested=[],
+        mechanistic_findings=ReportSection(title="Mechanistic", content="Test"),
+        clinical_findings=ReportSection(title="Clinical", content="Test"),
+        drug_candidates=["TestDrug"],
+        limitations=["Test limitation"],
+        conclusion="Test conclusion",
+        references=[
+            # Valid reference (matches sample_evidence)
+            {
+                "title": "Metformin mechanisms",
+                "url": "https://pubmed.ncbi.nlm.nih.gov/12345/",
+                "authors": ["Smith J", "Jones A"],
+                "date": "2023",
+                "source": "pubmed"
+            },
+            # HALLUCINATED reference (URL doesn't exist in evidence)
+            {
+                "title": "Fake Paper That Doesn't Exist",
+                "url": "https://fake-journal.com/made-up-paper",
+                "authors": ["Hallucinated A"],
+                "date": "2024",
+                "source": "fake"
+            },
+            # Another HALLUCINATED reference
+            {
+                "title": "Invented Research",
+                "url": "https://pubmed.ncbi.nlm.nih.gov/99999999/",
+                "authors": ["NotReal B"],
+                "date": "2025",
+                "source": "pubmed"
+            }
+        ],
+        sources_searched=["pubmed"],
+        total_papers_reviewed=1,
+        search_iterations=1,
+        confidence_score=0.5
+    )
+
+    # Validate - should remove hallucinated references
+    validated_report = validate_references(report_with_hallucinations, sample_evidence)
+
+    # Only the valid reference should remain
+    assert len(validated_report.references) == 1
+    assert validated_report.references[0]["title"] == "Metformin mechanisms"
+    assert "Fake Paper" not in str(validated_report.references)
+
+
+def test_citation_validator_handles_empty_references():
+    """Citation validator should handle reports with no references."""
+    from src.utils.citation_validator import validate_references
+
+    report = ResearchReport(
+        title="Empty Refs Report",
+        executive_summary="This report has no references...",
+        research_question="Testing empty refs",
+        methodology=ReportSection(title="Methodology", content="Test"),
+        hypotheses_tested=[],
+        mechanistic_findings=ReportSection(title="Mechanistic", content="Test"),
+        clinical_findings=ReportSection(title="Clinical", content="Test"),
+        drug_candidates=[],
+        limitations=[],
+        conclusion="Test",
+        references=[],  # Empty!
+        sources_searched=[],
+        total_papers_reviewed=0,
+        search_iterations=0,
+        confidence_score=0.0
+    )
+
+    validated = validate_references(report, [])
+    assert validated.references == []
 ```
 
 ---
````
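
The core of `validate_references` is a URL allowlist built from collected evidence. A minimal sketch with plain dicts; `filter_references` and the sample data are illustrative stand-ins for the spec's Pydantic-based function:

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("citation_check")

def filter_references(references: list[dict], valid_urls: set[str]) -> list[dict]:
    """Keep only references whose URL appears in the collected evidence."""
    kept = []
    for ref in references:
        if ref.get("url", "") in valid_urls:
            kept.append(ref)
        else:
            logger.warning("Dropped unverifiable reference: %r", ref.get("title"))
    return kept

evidence_urls = {"https://pubmed.ncbi.nlm.nih.gov/12345/"}
refs = [
    {"title": "Metformin mechanisms", "url": "https://pubmed.ncbi.nlm.nih.gov/12345/"},
    {"title": "Fake Paper", "url": "https://fake-journal.com/made-up"},
]
print(filter_references(refs, evidence_urls))  # only the PubMed reference survives
```

The spec's version adds a fuzzy title fallback on top of this allowlist, since aggregators sometimes rewrite URLs for the same paper.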