VibecoderMcSwaggins committed
Commit ec3d7dc · 1 Parent(s): 2dc022a

docs: comprehensive Phase 6-8 spec revisions (Senior Architect audit)


Phase 6 (Embeddings):
- Add async/executor pattern for all embedding methods
- Add HuggingFace Spaces deployment notes for model caching
- Update tests to be async

Phase 7 (Hypothesis):
- Add text utilities: truncate_at_sentence(), select_diverse_evidence()
- Replace arbitrary evidence[:10] with MMR (Maximal Marginal Relevance)
- Sentence-boundary-aware truncation

Phase 8 (Report) - CRITICAL:
- Add citation validation to prevent LLM hallucination
- Add validate_references() function spec
- Update SYSTEM_PROMPT with strict citation requirements
- Add comprehensive citation validation tests

All changes address issues identified in the "Senior Architect" audit:
- Event loop blocking from sync sentence-transformers
- Arbitrary truncation losing important evidence
- Hallucinated citations in medical research reports

docs/implementation/06_phase_embeddings.md CHANGED
````diff
@@ -82,14 +82,33 @@ embeddings = [
 
 ### 4.2 Embedding Service (`src/services/embeddings.py`)
 
+> **CRITICAL: Async Pattern Required**
+>
+> `sentence-transformers` is synchronous and CPU-bound. Running it directly in async code
+> will **block the event loop**, freezing the UI and halting all concurrent operations.
+>
+> **Solution**: Use `asyncio.run_in_executor()` to offload to thread pool.
+> This pattern already exists in `src/tools/websearch.py:28-34`.
+
 ```python
-"""Embedding service for semantic search."""
+"""Embedding service for semantic search.
+
+IMPORTANT: All public methods are async to avoid blocking the event loop.
+The sentence-transformers model is CPU-bound, so we use run_in_executor().
+"""
+import asyncio
 from typing import List
+
 import chromadb
 from sentence_transformers import SentenceTransformer
 
+
 class EmbeddingService:
-    """Handles text embedding and vector storage."""
+    """Handles text embedding and vector storage.
+
+    All embedding operations run in a thread pool to avoid blocking
+    the async event loop. See src/tools/websearch.py for the pattern.
+    """
 
     def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
         self._model = SentenceTransformer(model_name)
@@ -99,27 +118,67 @@ class EmbeddingService:
             metadata={"hnsw:space": "cosine"}
         )
 
-    def embed(self, text: str) -> List[float]:
-        """Embed a single text."""
+    # ─────────────────────────────────────────────────────────────────
+    # Sync internal methods (run in thread pool)
+    # ─────────────────────────────────────────────────────────────────
+
+    def _sync_embed(self, text: str) -> List[float]:
+        """Synchronous embedding - DO NOT call directly from async code."""
         return self._model.encode(text).tolist()
 
-    def add_evidence(self, evidence_id: str, content: str, metadata: dict) -> None:
-        """Add evidence to vector store."""
-        embedding = self.embed(content)
-        self._collection.add(
-            ids=[evidence_id],
-            embeddings=[embedding],
-            metadatas=[metadata],
-            documents=[content]
+    def _sync_batch_embed(self, texts: List[str]) -> List[List[float]]:
+        """Batch embedding for efficiency - DO NOT call directly from async code."""
+        return [e.tolist() for e in self._model.encode(texts)]
+
+    # ─────────────────────────────────────────────────────────────────
+    # Async public methods (safe for event loop)
+    # ─────────────────────────────────────────────────────────────────
+
+    async def embed(self, text: str) -> List[float]:
+        """Embed a single text (async-safe).
+
+        Uses run_in_executor to avoid blocking the event loop.
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self._sync_embed, text)
+
+    async def embed_batch(self, texts: List[str]) -> List[List[float]]:
+        """Batch embed multiple texts (async-safe, more efficient)."""
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self._sync_batch_embed, texts)
+
+    async def add_evidence(self, evidence_id: str, content: str, metadata: dict) -> None:
+        """Add evidence to vector store (async-safe)."""
+        embedding = await self.embed(content)
+        # ChromaDB operations are fast, but wrap for consistency
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(
+            None,
+            lambda: self._collection.add(
+                ids=[evidence_id],
+                embeddings=[embedding],
+                metadatas=[metadata],
+                documents=[content]
+            )
         )
 
-    def search_similar(self, query: str, n_results: int = 5) -> List[dict]:
-        """Find semantically similar evidence."""
-        query_embedding = self.embed(query)
-        results = self._collection.query(
-            query_embeddings=[query_embedding],
-            n_results=n_results
+    async def search_similar(self, query: str, n_results: int = 5) -> List[dict]:
+        """Find semantically similar evidence (async-safe)."""
+        query_embedding = await self.embed(query)
+
+        loop = asyncio.get_running_loop()
+        results = await loop.run_in_executor(
+            None,
+            lambda: self._collection.query(
+                query_embeddings=[query_embedding],
+                n_results=n_results
+            )
         )
+
+        # Handle empty results gracefully
+        if not results["ids"] or not results["ids"][0]:
+            return []
+
         return [
             {"id": id, "content": doc, "metadata": meta, "distance": dist}
             for id, doc, meta, dist in zip(
@@ -130,14 +189,14 @@ class EmbeddingService:
             )
         ]
 
-    def deduplicate(self, new_evidence: List, threshold: float = 0.9) -> List:
-        """Remove semantically duplicate evidence."""
+    async def deduplicate(self, new_evidence: List, threshold: float = 0.9) -> List:
+        """Remove semantically duplicate evidence (async-safe)."""
         unique = []
         for evidence in new_evidence:
-            similar = self.search_similar(evidence.content, n_results=1)
+            similar = await self.search_similar(evidence.content, n_results=1)
             if not similar or similar[0]["distance"] > (1 - threshold):
                 unique.append(evidence)
-                self.add_evidence(
+                await self.add_evidence(
                     evidence_id=evidence.citation.url,
                     content=evidence.content,
                     metadata={"source": evidence.citation.source}
@@ -147,7 +206,7 @@ class EmbeddingService:
 
 ### 4.3 Enhanced SearchAgent (`src/agents/search_agent.py`)
 
-Update SearchAgent to use embeddings:
+Update SearchAgent to use embeddings. **Note**: All embedding calls are `await`ed:
 
 ```python
 class SearchAgent(BaseAgent):
@@ -166,14 +225,20 @@ class SearchAgent(BaseAgent):
         # Execute keyword search
         result = await self._handler.execute(query, max_results_per_tool=10)
 
-        # Semantic deduplication (NEW)
+        # Semantic deduplication (NEW) - ALL CALLS ARE AWAITED
         if self._embeddings:
-            unique_evidence = self._embeddings.deduplicate(result.evidence)
+            # Deduplicate by semantic similarity (async-safe)
+            unique_evidence = await self._embeddings.deduplicate(result.evidence)
 
-            # Also search for semantically related evidence
-            related = self._embeddings.search_similar(query, n_results=5)
-            # Add related evidence not already in results
-            # ... merge logic ...
+            # Also search for semantically related evidence (async-safe)
+            related = await self._embeddings.search_similar(query, n_results=5)
+
+            # Merge related evidence not already in results
+            existing_urls = {e.citation.url for e in unique_evidence}
+            for item in related:
+                if item["id"] not in existing_urls:
+                    # Reconstruct Evidence from stored data
+                    # ... merge logic ...
 
         # ... rest of method ...
 ```
@@ -193,6 +258,40 @@ The system has semantic search enabled. When evidence is found:
 """
 ```
 
+### 4.5 HuggingFace Spaces Deployment
+
+> **⚠️ Important for HF Spaces**
+>
+> `sentence-transformers` downloads models (~500MB) to `~/.cache` on first use.
+> HuggingFace Spaces have **ephemeral storage** - the cache is wiped on restart.
+> This causes slow cold starts and bandwidth usage.
+
+**Solution**: Pre-download the model in your Dockerfile:
+
+```dockerfile
+# In Dockerfile
+FROM python:3.11-slim
+
+# Set cache directory
+ENV HF_HOME=/app/.cache
+ENV TRANSFORMERS_CACHE=/app/.cache
+
+# Pre-download the embedding model during build
+RUN pip install sentence-transformers && \
+    python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
+
+# ... rest of Dockerfile
+```
+
+**Alternative**: Use an environment variable to specify a persistent path:
+
+```yaml
+# In HF Spaces settings or app.yaml
+env:
+  - name: HF_HOME
+    value: /data/.cache  # Persistent volume
+```
+
 ---
 
 ## 5. Directory Structure After Phase 6
@@ -214,26 +313,31 @@ src/
 
 ### 6.1 Unit Tests (`tests/unit/services/test_embeddings.py`)
 
+> **Note**: All tests are async since the EmbeddingService methods are async.
+
 ```python
 """Unit tests for EmbeddingService."""
 import pytest
 from src.services.embeddings import EmbeddingService
 
+
 class TestEmbeddingService:
-    def test_embed_returns_vector(self):
+    @pytest.mark.asyncio
+    async def test_embed_returns_vector(self):
         """Embedding should return a float vector."""
         service = EmbeddingService()
-        embedding = service.embed("metformin diabetes")
+        embedding = await service.embed("metformin diabetes")
         assert isinstance(embedding, list)
         assert len(embedding) > 0
         assert all(isinstance(x, float) for x in embedding)
 
-    def test_similar_texts_have_close_embeddings(self):
+    @pytest.mark.asyncio
+    async def test_similar_texts_have_close_embeddings(self):
         """Semantically similar texts should have similar embeddings."""
         service = EmbeddingService()
-        e1 = service.embed("metformin treats diabetes")
-        e2 = service.embed("metformin is used for diabetes treatment")
-        e3 = service.embed("the weather is sunny today")
+        e1 = await service.embed("metformin treats diabetes")
+        e2 = await service.embed("metformin is used for diabetes treatment")
+        e3 = await service.embed("the weather is sunny today")
 
         # Cosine similarity helper
         from numpy import dot
@@ -243,18 +347,37 @@ class TestEmbeddingService:
         # Similar texts should be closer
         assert cosine(e1, e2) > cosine(e1, e3)
 
-    def test_add_and_search(self):
+    @pytest.mark.asyncio
+    async def test_batch_embed_efficient(self):
+        """Batch embedding should be more efficient than individual calls."""
+        service = EmbeddingService()
+        texts = ["text one", "text two", "text three"]
+
+        # Batch embed
+        batch_results = await service.embed_batch(texts)
+        assert len(batch_results) == 3
+        assert all(isinstance(e, list) for e in batch_results)
+
+    @pytest.mark.asyncio
+    async def test_add_and_search(self):
         """Should be able to add evidence and search for similar."""
         service = EmbeddingService()
-        service.add_evidence(
+        await service.add_evidence(
             evidence_id="test1",
             content="Metformin activates AMPK pathway",
            metadata={"source": "pubmed"}
         )
 
-        results = service.search_similar("AMPK activation drugs", n_results=1)
+        results = await service.search_similar("AMPK activation drugs", n_results=1)
         assert len(results) == 1
         assert "AMPK" in results[0]["content"]
+
+    @pytest.mark.asyncio
+    async def test_search_similar_empty_collection(self):
+        """Search on empty collection should return empty list, not error."""
+        service = EmbeddingService()
+        results = await service.search_similar("anything", n_results=5)
+        assert results == []
 ```
 
 ---
````
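
To sanity-check the Phase 6 executor pattern outside the app, here is a minimal sketch, assuming only `sentence-transformers` is installed. The model name comes from the spec above; the free-standing `embed` helper and `main` driver are illustrative, not spec code:

```python
import asyncio
from sentence_transformers import SentenceTransformer

# Model name from the spec; downloaded on first run.
_model = SentenceTransformer("all-MiniLM-L6-v2")

def _sync_embed(text: str) -> list[float]:
    # CPU-bound encode stays synchronous.
    return _model.encode(text).tolist()

async def embed(text: str) -> list[float]:
    # Offload to the default thread pool so the event loop stays responsive.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _sync_embed, text)

async def main() -> None:
    vec = await embed("metformin activates AMPK")
    print(len(vec))  # all-MiniLM-L6-v2 produces 384-dimensional vectors

asyncio.run(main())
```

The `run_in_executor(None, fn, *args)` shape is the same one `EmbeddingService.embed` and `embed_batch` use; only the wrapper moves into a class.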
docs/implementation/07_phase_hypothesis.md CHANGED
````diff
@@ -116,10 +116,150 @@ class HypothesisAssessment(BaseModel):
 
 ## 4. Implementation
 
+### 4.0 Text Utilities (`src/utils/text_utils.py`)
+
+> **Why These Utilities?**
+>
+> The original spec used arbitrary truncation (`evidence[:10]` and `content[:300]`).
+> This loses important information randomly. These utilities provide:
+> 1. **Sentence-aware truncation** - cuts at sentence boundaries, not mid-word
+> 2. **Diverse evidence selection** - uses embeddings to select varied evidence (MMR)
+
+```python
+"""Text processing utilities for evidence handling."""
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from src.services.embeddings import EmbeddingService
+    from src.utils.models import Evidence
+
+
+def truncate_at_sentence(text: str, max_chars: int = 300) -> str:
+    """Truncate text at sentence boundary, preserving meaning.
+
+    Args:
+        text: The text to truncate
+        max_chars: Maximum characters (default 300)
+
+    Returns:
+        Text truncated at last complete sentence within limit
+    """
+    if len(text) <= max_chars:
+        return text
+
+    # Find truncation point
+    truncated = text[:max_chars]
+
+    # Look for sentence endings: . ! ? followed by space or end
+    for sep in ['. ', '! ', '? ', '.\n', '!\n', '?\n']:
+        last_sep = truncated.rfind(sep)
+        if last_sep > max_chars // 2:  # Don't truncate too aggressively
+            return text[:last_sep + 1].strip()
+
+    # Fallback: find last period
+    last_period = truncated.rfind('.')
+    if last_period > max_chars // 2:
+        return text[:last_period + 1].strip()
+
+    # Last resort: truncate at word boundary
+    last_space = truncated.rfind(' ')
+    if last_space > 0:
+        return text[:last_space].strip() + "..."
+
+    return truncated + "..."
+
+
+async def select_diverse_evidence(
+    evidence: list["Evidence"],
+    n: int,
+    query: str,
+    embeddings: "EmbeddingService | None" = None
+) -> list["Evidence"]:
+    """Select n most diverse and relevant evidence items.
+
+    Uses Maximal Marginal Relevance (MMR) when embeddings available,
+    falls back to relevance_score sorting otherwise.
+
+    Args:
+        evidence: All available evidence
+        n: Number of items to select
+        query: Original query for relevance scoring
+        embeddings: Optional EmbeddingService for semantic diversity
+
+    Returns:
+        Selected evidence items, diverse and relevant
+    """
+    if not evidence:
+        return []
+
+    if n >= len(evidence):
+        return evidence
+
+    # Fallback: sort by relevance score if no embeddings
+    if embeddings is None:
+        return sorted(
+            evidence,
+            key=lambda e: e.relevance_score,
+            reverse=True
+        )[:n]
+
+    # MMR: Maximal Marginal Relevance for diverse selection
+    # Score = λ * relevance - (1-λ) * max_similarity_to_selected
+    lambda_param = 0.7  # Balance relevance vs diversity
+
+    # Get query embedding
+    query_emb = await embeddings.embed(query)
+
+    # Get all evidence embeddings
+    evidence_embs = await embeddings.embed_batch([e.content for e in evidence])
+
+    # Compute relevance scores (cosine similarity to query)
+    from numpy import dot
+    from numpy.linalg import norm
+    cosine = lambda a, b: float(dot(a, b) / (norm(a) * norm(b)))
+
+    relevance_scores = [cosine(query_emb, emb) for emb in evidence_embs]
+
+    # Greedy MMR selection
+    selected_indices: list[int] = []
+    remaining = set(range(len(evidence)))
+
+    for _ in range(n):
+        best_score = float('-inf')
+        best_idx = -1
+
+        for idx in remaining:
+            # Relevance component
+            relevance = relevance_scores[idx]
+
+            # Diversity component: max similarity to already selected
+            if selected_indices:
+                max_sim = max(
+                    cosine(evidence_embs[idx], evidence_embs[sel])
+                    for sel in selected_indices
+                )
+            else:
+                max_sim = 0
+
+            # MMR score
+            mmr_score = lambda_param * relevance - (1 - lambda_param) * max_sim
+
+            if mmr_score > best_score:
+                best_score = mmr_score
+                best_idx = idx
+
+        if best_idx >= 0:
+            selected_indices.append(best_idx)
+            remaining.remove(best_idx)
+
+    return [evidence[i] for i in selected_indices]
+```
+
 ### 4.1 Hypothesis Prompts (`src/prompts/hypothesis.py`)
 
 ```python
 """Prompts for Hypothesis Agent."""
+from src.utils.text_utils import truncate_at_sentence, select_diverse_evidence
 
 SYSTEM_PROMPT = """You are a biomedical research scientist specializing in drug repurposing.
 
@@ -141,16 +281,35 @@ Example hypothesis format:
 
 Be specific. Use actual gene/protein names when possible."""
 
-def format_hypothesis_prompt(query: str, evidence: list) -> str:
-    """Format prompt for hypothesis generation."""
+
+async def format_hypothesis_prompt(
+    query: str,
+    evidence: list,
+    embeddings=None
+) -> str:
+    """Format prompt for hypothesis generation.
+
+    Uses smart evidence selection instead of arbitrary truncation.
+
+    Args:
+        query: The research query
+        evidence: All collected evidence
+        embeddings: Optional EmbeddingService for diverse selection
+    """
+    # Select diverse, relevant evidence (not arbitrary first 10)
+    selected = await select_diverse_evidence(
+        evidence, n=10, query=query, embeddings=embeddings
+    )
+
+    # Format with sentence-aware truncation
     evidence_text = "\n".join([
-        f"- {e.citation.title}: {e.content[:300]}..."
-        for e in evidence[:10]
+        f"- **{e.citation.title}** ({e.citation.source}): {truncate_at_sentence(e.content, 300)}"
+        for e in selected
     ])
 
     return f"""Based on the following evidence about "{query}", generate mechanistic hypotheses.
 
-## Evidence
+## Evidence ({len(selected)} papers selected for diversity)
 {evidence_text}
 
 ## Task
@@ -167,7 +326,7 @@ Generate 2-4 hypotheses, prioritized by confidence."""
 ```python
 """Hypothesis agent for mechanistic reasoning."""
 from collections.abc import AsyncIterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from agent_framework import (
     AgentRunResponse,
@@ -183,6 +342,9 @@ from src.prompts.hypothesis import SYSTEM_PROMPT, format_hypothesis_prompt
 from src.utils.config import settings
 from src.utils.models import Evidence, HypothesisAssessment
 
+if TYPE_CHECKING:
+    from src.services.embeddings import EmbeddingService
+
 
 class HypothesisAgent(BaseAgent):
     """Generates mechanistic hypotheses based on evidence."""
@@ -190,12 +352,14 @@ class HypothesisAgent(BaseAgent):
     def __init__(
         self,
         evidence_store: dict[str, list[Evidence]],
+        embedding_service: "EmbeddingService | None" = None,  # NEW: for diverse selection
     ) -> None:
         super().__init__(
             name="HypothesisAgent",
             description="Generates scientific hypotheses about drug mechanisms to guide research",
         )
         self._evidence_store = evidence_store
+        self._embeddings = embedding_service  # Used for MMR evidence selection
         self._agent = Agent(
             model=settings.llm_provider,  # Uses configured LLM
             output_type=HypothesisAssessment,
@@ -225,8 +389,11 @@ class HypothesisAgent(BaseAgent):
                 response_id="hypothesis-no-evidence",
             )
 
-        # Generate hypotheses
-        prompt = format_hypothesis_prompt(query, evidence)
+        # Generate hypotheses with diverse evidence selection
+        # NOTE: format_hypothesis_prompt is now async
+        prompt = await format_hypothesis_prompt(
+            query, evidence, embeddings=self._embeddings
+        )
         result = await self._agent.run(prompt)
         assessment = result.output
 
````
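
The MMR rule in `select_diverse_evidence` (score = lam * relevance - (1 - lam) * max similarity to items already picked) can be exercised without the embedding service. A standalone sketch with toy 3-D vectors; `mmr_select` and the vectors are illustrative stand-ins, not spec code:

```python
from numpy import dot
from numpy.linalg import norm

def cosine(a, b) -> float:
    return float(dot(a, b) / (norm(a) * norm(b)))

def mmr_select(query, cands, n, lam=0.7):
    # Greedy MMR: score = lam * relevance - (1 - lam) * max sim to picks.
    relevance = [cosine(query, c) for c in cands]
    picked: list[int] = []
    remaining = set(range(len(cands)))
    for _ in range(min(n, len(cands))):
        def score(i):
            max_sim = max((cosine(cands[i], cands[j]) for j in picked), default=0.0)
            return lam * relevance[i] - (1 - lam) * max_sim
        best = max(remaining, key=score)
        picked.append(best)
        remaining.remove(best)
    return picked

# cands[1] is a near-duplicate of cands[0]; cands[2] is less relevant but distinct.
query = [1.0, 0.0, 0.0]
cands = [[0.8, 0.6, 0.0], [0.79, 0.6, 0.05], [0.75, 0.0, 0.66]]
print(mmr_select(query, cands, n=2))  # [0, 2] with these toy vectors
```

With these values the near-duplicate is passed over in favor of the distinct item, which is exactly the behavior the spec wants from evidence selection at lambda = 0.7.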
docs/implementation/08_phase_report.md CHANGED
````diff
@@ -190,10 +190,100 @@ class ResearchReport(BaseModel):
 
 ## 4. Implementation
 
+### 4.0 Citation Validation (`src/utils/citation_validator.py`)
+
+> **🚨 CRITICAL: Why Citation Validation?**
+>
+> LLMs frequently **hallucinate** citations - inventing paper titles, authors, and URLs
+> that don't exist. For a medical research tool, fake citations are **dangerous**.
+>
+> This validation layer ensures every reference in the report actually exists
+> in the collected evidence.
+
+```python
+"""Citation validation to prevent LLM hallucination.
+
+CRITICAL: Medical research requires accurate citations.
+This module validates that all references exist in collected evidence.
+"""
+import logging
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from src.utils.models import Evidence, ResearchReport
+
+logger = logging.getLogger(__name__)
+
+
+def validate_references(
+    report: "ResearchReport",
+    evidence: list["Evidence"]
+) -> "ResearchReport":
+    """Ensure all references actually exist in collected evidence.
+
+    CRITICAL: Prevents LLM hallucination of citations.
+
+    Args:
+        report: The generated research report
+        evidence: All evidence collected during research
+
+    Returns:
+        Report with only valid references (hallucinated ones removed)
+    """
+    # Build set of valid URLs from evidence
+    valid_urls = {e.citation.url for e in evidence}
+    valid_titles = {e.citation.title.lower() for e in evidence}
+
+    validated_refs = []
+    removed_count = 0
+
+    for ref in report.references:
+        ref_url = ref.get("url", "")
+        ref_title = ref.get("title", "").lower()
+
+        # Check if URL matches collected evidence
+        if ref_url in valid_urls:
+            validated_refs.append(ref)
+        # Fallback: check title match (URLs might differ slightly)
+        elif ref_title and any(ref_title in t or t in ref_title for t in valid_titles):
+            validated_refs.append(ref)
+        else:
+            removed_count += 1
+            logger.warning(
+                f"Removed hallucinated reference: '{ref.get('title', 'Unknown')}' "
+                f"(URL: {ref_url[:50]}...)"
+            )
+
+    if removed_count > 0:
+        logger.info(
+            f"Citation validation removed {removed_count} hallucinated references. "
+            f"{len(validated_refs)} valid references remain."
+        )
+
+    # Update report with validated references
+    report.references = validated_refs
+    return report
+
+
+def build_reference_from_evidence(evidence: "Evidence") -> dict:
+    """Build a properly formatted reference from evidence.
+
+    Use this to ensure references match the original evidence exactly.
+    """
+    return {
+        "title": evidence.citation.title,
+        "authors": evidence.citation.authors or ["Unknown"],
+        "source": evidence.citation.source,
+        "date": evidence.citation.date or "n.d.",
+        "url": evidence.citation.url,
+    }
+```
+
 ### 4.1 Report Prompts (`src/prompts/report.py`)
 
 ```python
 """Prompts for Report Agent."""
+from src.utils.text_utils import truncate_at_sentence, select_diverse_evidence
 
 SYSTEM_PROMPT = """You are a scientific writer specializing in drug repurposing research reports.
 
@@ -210,34 +300,66 @@ A good report:
 8. Provides a balanced CONCLUSION
 9. Includes properly formatted REFERENCES
 
-Write in scientific but accessible language. Be specific about evidence strength."""
+Write in scientific but accessible language. Be specific about evidence strength.
+
+─────────────────────────────────────────────────────────────────────────────
+🚨 CRITICAL CITATION REQUIREMENTS 🚨
+─────────────────────────────────────────────────────────────────────────────
+
+You MUST follow these rules for the References section:
+
+1. You may ONLY cite papers that appear in the Evidence section above
+2. Every reference URL must EXACTLY match a provided evidence URL
+3. Do NOT invent, fabricate, or hallucinate any references
+4. Do NOT modify paper titles, authors, dates, or URLs
+5. If unsure about a citation, OMIT it rather than guess
+6. Copy URLs exactly as provided - do not create similar-looking URLs
 
+VIOLATION OF THESE RULES PRODUCES DANGEROUS MISINFORMATION.
+─────────────────────────────────────────────────────────────────────────────"""
 
-def format_report_prompt(
+
+async def format_report_prompt(
     query: str,
     evidence: list,
     hypotheses: list,
     assessment: dict,
-    metadata: dict
+    metadata: dict,
+    embeddings=None
 ) -> str:
-    """Format prompt for report generation."""
+    """Format prompt for report generation.
+
+    Includes full evidence details for accurate citation.
+    """
+    # Select diverse evidence (not arbitrary truncation)
+    selected = await select_diverse_evidence(
+        evidence, n=20, query=query, embeddings=embeddings
+    )
 
+    # Include FULL citation details for each evidence item
+    # This helps the LLM create accurate references
     evidence_summary = "\n".join([
-        f"- [{e.citation.title}]({e.citation.url}): {e.content[:200]}..."
-        for e in evidence[:15]
+        f"- **Title**: {e.citation.title}\n"
+        f"  **URL**: {e.citation.url}\n"
+        f"  **Authors**: {', '.join(e.citation.authors or ['Unknown'])}\n"
+        f"  **Date**: {e.citation.date or 'n.d.'}\n"
+        f"  **Source**: {e.citation.source}\n"
+        f"  **Content**: {truncate_at_sentence(e.content, 200)}\n"
+        for e in selected
     ])
 
     hypotheses_summary = "\n".join([
         f"- {h.drug} → {h.target} → {h.pathway} → {h.effect} (Confidence: {h.confidence:.0%})"
         for h in hypotheses
-    ])
+    ]) if hypotheses else "No hypotheses generated yet."
 
     return f"""Generate a structured research report for the following query.
 
 ## Original Query
 {query}
 
-## Evidence Collected ({len(evidence)} papers)
+## Evidence Collected ({len(selected)} papers, selected for diversity)
+
 {evidence_summary}
 
 ## Hypotheses Generated
@@ -252,7 +374,9 @@ def format_report_prompt(
 - Sources Searched: {', '.join(metadata.get('sources', []))}
 - Search Iterations: {metadata.get('iterations', 0)}
 
-Generate a complete ResearchReport with all sections filled in."""
+Generate a complete ResearchReport with all sections filled in.
+
+REMINDER: Only cite papers from the Evidence section above. Copy URLs exactly."""
 ```
 
 ### 4.2 Report Agent (`src/agents/report_agent.py`)
@@ -260,7 +384,7 @@ Generate a complete ResearchReport with all sections filled in."""
 ```python
 """Report agent for generating structured research reports."""
 from collections.abc import AsyncIterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from agent_framework import (
     AgentRunResponse,
@@ -273,9 +397,13 @@ from agent_framework import (
 from pydantic_ai import Agent
 
 from src.prompts.report import SYSTEM_PROMPT, format_report_prompt
+from src.utils.citation_validator import validate_references  # CRITICAL
 from src.utils.config import settings
 from src.utils.models import Evidence, MechanismHypothesis, ResearchReport
 
+if TYPE_CHECKING:
+    from src.services.embeddings import EmbeddingService
+
 
 class ReportAgent(BaseAgent):
     """Generates structured scientific reports from evidence and hypotheses."""
@@ -283,12 +411,14 @@ class ReportAgent(BaseAgent):
     def __init__(
         self,
         evidence_store: dict[str, list[Evidence]],
+        embedding_service: "EmbeddingService | None" = None,  # For diverse selection
     ) -> None:
         super().__init__(
             name="ReportAgent",
             description="Generates structured scientific research reports with citations",
         )
         self._evidence_store = evidence_store
+        self._embeddings = embedding_service
         self._agent = Agent(
             model=settings.llm_provider,
             output_type=ResearchReport,
@@ -325,19 +455,25 @@ class ReportAgent(BaseAgent):
             "iterations": self._evidence_store.get("iteration_count", 0),
         }
 
-        # Generate report
-        prompt = format_report_prompt(
+        # Generate report (format_report_prompt is now async)
+        prompt = await format_report_prompt(
             query=query,
             evidence=evidence,
             hypotheses=hypotheses,
            assessment=assessment,
-            metadata=metadata
+            metadata=metadata,
+            embeddings=self._embeddings,
         )
 
         result = await self._agent.run(prompt)
         report = result.output
 
-        # Store report
+        # ═══════════════════════════════════════════════════════════════════
+        # 🚨 CRITICAL: Validate citations to prevent hallucination
+        # ═══════════════════════════════════════════════════════════════════
+        report = validate_references(report, evidence)
+
+        # Store validated report
         self._evidence_store["final_report"] = report
 
         # Return markdown version
@@ -553,6 +689,94 @@ async def test_report_agent_no_evidence():
     response = await agent.run("test query")
 
     assert "Cannot generate report" in response.messages[0].text
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 🚨 CRITICAL: Citation Validation Tests
+# ═══════════════════════════════════════════════════════════════════════════
+
+@pytest.mark.asyncio
+async def test_report_agent_removes_hallucinated_citations(sample_evidence):
+    """ReportAgent should remove citations not in evidence."""
+    from src.utils.citation_validator import validate_references
+
+    # Create report with mix of valid and hallucinated references
+    report_with_hallucinations = ResearchReport(
+        title="Test Report",
+        executive_summary="This is a test report for citation validation...",
+        research_question="Testing citation validation",
+        methodology=ReportSection(title="Methodology", content="Test"),
+        hypotheses_tested=[],
+        mechanistic_findings=ReportSection(title="Mechanistic", content="Test"),
+        clinical_findings=ReportSection(title="Clinical", content="Test"),
+        drug_candidates=["TestDrug"],
+        limitations=["Test limitation"],
+        conclusion="Test conclusion",
+        references=[
+            # Valid reference (matches sample_evidence)
+            {
+                "title": "Metformin mechanisms",
+                "url": "https://pubmed.ncbi.nlm.nih.gov/12345/",
+                "authors": ["Smith J", "Jones A"],
+                "date": "2023",
+                "source": "pubmed"
+            },
+            # HALLUCINATED reference (URL doesn't exist in evidence)
+            {
+                "title": "Fake Paper That Doesn't Exist",
+                "url": "https://fake-journal.com/made-up-paper",
+                "authors": ["Hallucinated A"],
+                "date": "2024",
+                "source": "fake"
+            },
+            # Another HALLUCINATED reference
+            {
+                "title": "Invented Research",
+                "url": "https://pubmed.ncbi.nlm.nih.gov/99999999/",
+                "authors": ["NotReal B"],
+                "date": "2025",
+                "source": "pubmed"
+            }
+        ],
+        sources_searched=["pubmed"],
+        total_papers_reviewed=1,
+        search_iterations=1,
+        confidence_score=0.5
+    )
+
+    # Validate - should remove hallucinated references
+    validated_report = validate_references(report_with_hallucinations, sample_evidence)
+
+    # Only the valid reference should remain
+    assert len(validated_report.references) == 1
+    assert validated_report.references[0]["title"] == "Metformin mechanisms"
+    assert "Fake Paper" not in str(validated_report.references)
+
+
+def test_citation_validator_handles_empty_references():
+    """Citation validator should handle reports with no references."""
+    from src.utils.citation_validator import validate_references
+
+    report = ResearchReport(
+        title="Empty Refs Report",
+        executive_summary="This report has no references...",
+        research_question="Testing empty refs",
+        methodology=ReportSection(title="Methodology", content="Test"),
+        hypotheses_tested=[],
+        mechanistic_findings=ReportSection(title="Mechanistic", content="Test"),
+        clinical_findings=ReportSection(title="Clinical", content="Test"),
+        drug_candidates=[],
+        limitations=[],
+        conclusion="Test",
+        references=[],  # Empty!
+        sources_searched=[],
+        total_papers_reviewed=0,
+        search_iterations=0,
+        confidence_score=0.0
+    )
+
+    validated = validate_references(report, [])
+    assert validated.references == []
 ```
 
 ---
````
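
The core of `validate_references` is a URL allowlist built from collected evidence. A minimal sketch with plain dicts; `filter_references` and the sample data are illustrative stand-ins for the spec's Pydantic-based function:

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("citation_check")

def filter_references(references: list[dict], valid_urls: set[str]) -> list[dict]:
    """Keep only references whose URL appears in the collected evidence."""
    kept = []
    for ref in references:
        if ref.get("url", "") in valid_urls:
            kept.append(ref)
        else:
            logger.warning("Dropped unverifiable reference: %r", ref.get("title"))
    return kept

evidence_urls = {"https://pubmed.ncbi.nlm.nih.gov/12345/"}
refs = [
    {"title": "Metformin mechanisms", "url": "https://pubmed.ncbi.nlm.nih.gov/12345/"},
    {"title": "Fake Paper", "url": "https://fake-journal.com/made-up"},
]
print(filter_references(refs, evidence_urls))  # only the PubMed reference survives
```

The spec's version adds a fuzzy title fallback on top of this allowlist, since aggregators sometimes rewrite URLs for the same paper.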