diff --git a/.env b/.env new file mode 100644 index 0000000000000000000000000000000000000000..68f8cc46430205dbee57f1df4a9a2ada820baa8f --- /dev/null +++ b/.env @@ -0,0 +1,50 @@ +# Environment variables for GAIA Multi-Agent Framework + +# API Keys +GEMINI_API_KEY="AIzaSyDOQRtAJd-Kj-H6VT_0t38cZTz4Halgi3U" # For Google AI Studio +GOOGLE_API_KEY="AIzaSyACcl4uzlyqz4glW-_uCj0xGPSSH0uloAY" # For Google Custom Search JSON API +GOOGLE_CSE_ID="004c6b8673f0c4dd5" # For Google Custom Search Engine ID +TAVILY_API_KEY="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" # For Tavily Search API +ALPAFLOW_OPENAI_API_KEY="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" # For o4-mini model (or other OpenAI compatible endpoint) +WOLFRAM_ALPHA_APP_ID="YOUR_WOLFRAM_ALPHA_APP_ID" # For WolframAlpha API + +# GAIA Benchmark API +GAIA_API_URL="https://agents-course-unit4-scoring.hf.space" + +# Model Names (using defaults from original code, can be overridden) +ROLE_EMBED_MODEL="Snowflake/snowflake-arctic-embed-l-v2.0" +ROLE_RERANKER_MODEL="Alibaba-NLP/gte-multilingual-reranker-base" +ROLE_PROMPT_DATASET="fka/awesome-chatgpt-prompts" +ROLE_LLM_MODEL="models/gemini-1.5-pro" + +IMAGE_ANALYZER_LLM_MODEL="models/gemini-1.5-pro" + +VERIFIER_LLM_MODEL="models/gemini-2.0-flash" +VERIFIER_AGENT_LLM_MODEL="models/gemini-1.5-pro" +VERIFIER_CONFIDENCE_THRESHOLD="0.7" + +RESEARCH_AGENT_LLM_MODEL="models/gemini-1.5-pro" +# RESEARCH_AGENT_CHROME_NO_SANDBOX="true" # Example config for research agent browser +# RESEARCH_AGENT_CHROME_DISABLE_DEV_SHM="true" + +TEXT_ANALYZER_LLM_MODEL="models/gemini-1.5-pro" +TEXT_ANALYZER_AGENT_LLM_MODEL="models/gemini-1.5-pro" + +REASONING_TOOL_LLM_MODEL="o4-mini" +REASONING_TOOL_API_KEY_ENV="ALPAFLOW_OPENAI_API_KEY" # Env var name containing the key for reasoning tool LLM +REASONING_AGENT_LLM_MODEL="models/gemini-1.5-pro" + 
+PLANNER_TOOL_LLM_MODEL="models/gemini-1.5-pro" +PLANNER_AGENT_LLM_MODEL="models/gemini-1.5-pro" + +CODE_GEN_LLM_MODEL="o4-mini" +CODE_GEN_API_KEY_ENV="ALPAFLOW_OPENAI_API_KEY" # Env var name containing the key for code gen LLM +CODE_AGENT_LLM_MODEL="models/gemini-1.5-pro" + +MATH_AGENT_LLM_MODEL="models/gemini-1.5-pro" + +# New Feature Config (Placeholders) +YOUTUBE_CHUNK_DURATION_SECONDS="60" +TRANSCRIPTION_WHISPER_CPP_PATH="/path/to/whisper.cpp/main" # Example path +TRANSCRIPTION_WHISPER_MODEL_PATH="/path/to/whisper/model.bin" # Example path + diff --git a/.gitattributes b/.gitattributes index a77b0a9825cf66920a1bbb064659dc5a2b5ae2bd..a6344aac8c09253b3b630fb776ae94478aa0275b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text -stockfish filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore deleted file mode 100644 index d836f46114dfc188ab94cb238ffcd99184dc625a..0000000000000000000000000000000000000000 --- a/.gitignore +++ /dev/null @@ -1,139 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -pytest_cache/ -nosetests.xml -coverage.xml -*.cover -*.py,cover - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# 
IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -Pipfile.lock - -# poetry -poetry.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# virtualenv / venv -venv/ -ENV/ -env/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# profiling data -.prof - -# IDEs and editors -## VS Code -.vscode/ - -## PyCharm -.idea/ - -## Sublime Text -*.sublime-project -*.sublime-workspace - -## Emacs -*~ -\.#* - -## Vim -*.swp -*.swo -Session.vim - -# Environment variables file -.env -.venv - -# Logs -*.log \ No newline at end of file diff --git a/README.md b/README.md index 4c55fbc602935ed89eca59d00979c2d563fb30d6..7842e2e386a6f9ca52b6ce1df5fa2e266ffd6b86 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@ sdk: gradio sdk_version: 5.28.0 app_file: app.py pinned: false -hf_oauth: true --- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/__pycache__/app.cpython-311.pyc b/__pycache__/app.cpython-311.pyc deleted file mode 100644 index 450c836f64b8f886f5d28e11438fb2794af12e95..0000000000000000000000000000000000000000 Binary files a/__pycache__/app.cpython-311.pyc and /dev/null differ diff --git a/agents/__pycache__/__init__.cpython-311.pyc b/agents/__pycache__/__init__.cpython-311.pyc index 35ffcfdbbe7694a3ee66503b3131aab976c96b9e..033e36aa66dacc59c7464e3f55a5b9cb280a01e3 100644 Binary files a/agents/__pycache__/__init__.cpython-311.pyc and b/agents/__pycache__/__init__.cpython-311.pyc differ diff --git a/agents/__pycache__/advanced_validation_agent.cpython-311.pyc b/agents/__pycache__/advanced_validation_agent.cpython-311.pyc index b7702223fb5a2269735f601ecbce0f5d1c262163..34775b02f74c38fe5f4d73c2b1f71b5627be060b 100644 Binary files 
a/agents/__pycache__/advanced_validation_agent.cpython-311.pyc and b/agents/__pycache__/advanced_validation_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/code_agent.cpython-311.pyc b/agents/__pycache__/code_agent.cpython-311.pyc index addb635703e44fb287703e14ae7c909aa94aa551..f23103bb77b9fd177e6203e4747e3f36845ff067 100644 Binary files a/agents/__pycache__/code_agent.cpython-311.pyc and b/agents/__pycache__/code_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/figure_interpretation_agent.cpython-311.pyc b/agents/__pycache__/figure_interpretation_agent.cpython-311.pyc index a8cab4dd89ee348660d1adb6435c87b651ce03e3..86dabf95e7caf4da7309feb9795ed772bd7052e5 100644 Binary files a/agents/__pycache__/figure_interpretation_agent.cpython-311.pyc and b/agents/__pycache__/figure_interpretation_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/image_analyzer_agent.cpython-311.pyc b/agents/__pycache__/image_analyzer_agent.cpython-311.pyc index 84cd46820f341662f68b6e0258c01f1067f5d360..2a89d82c5feef752666da0bfd3f67f38445a4f29 100644 Binary files a/agents/__pycache__/image_analyzer_agent.cpython-311.pyc and b/agents/__pycache__/image_analyzer_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/long_context_management_agent.cpython-311.pyc b/agents/__pycache__/long_context_management_agent.cpython-311.pyc index b3acea542bea655d152491280ed2df82a2f0c15e..81c08f432495d360aa80f16bd95926e43ada0979 100644 Binary files a/agents/__pycache__/long_context_management_agent.cpython-311.pyc and b/agents/__pycache__/long_context_management_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/math_agent.cpython-311.pyc b/agents/__pycache__/math_agent.cpython-311.pyc index 607f3f2fc7859a45b86516d5feae9cf57d6a3611..07089cc758435f8d88af2aa8266a5be943ae3e39 100644 Binary files a/agents/__pycache__/math_agent.cpython-311.pyc and b/agents/__pycache__/math_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/planner_agent.cpython-311.pyc 
b/agents/__pycache__/planner_agent.cpython-311.pyc index 4dcebcb4e3ee7cd00db199b489ccd830e52a25e1..4464cdf36e1037daa629e48aabf1f5d0132782e6 100644 Binary files a/agents/__pycache__/planner_agent.cpython-311.pyc and b/agents/__pycache__/planner_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/reasoning_agent.cpython-311.pyc b/agents/__pycache__/reasoning_agent.cpython-311.pyc index 32635a1bf357e9e2a8183e8612f7f2cc48f36bbd..dccf57a15110609e1ed76c4bb6ec62539ee667e0 100644 Binary files a/agents/__pycache__/reasoning_agent.cpython-311.pyc and b/agents/__pycache__/reasoning_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/research_agent.cpython-311.pyc b/agents/__pycache__/research_agent.cpython-311.pyc index 16b302de4274d5215a998d48b53fd9f101eeba1f..da7bf66e4cc2074f8384f2544f914c396548af18 100644 Binary files a/agents/__pycache__/research_agent.cpython-311.pyc and b/agents/__pycache__/research_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/role_agent.cpython-311.pyc b/agents/__pycache__/role_agent.cpython-311.pyc index 09da61dfc13e004c7722d5145198165c993a9684..ee414ec48bbfbab5eed68fafe20e91ba9728730a 100644 Binary files a/agents/__pycache__/role_agent.cpython-311.pyc and b/agents/__pycache__/role_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/text_analyzer_agent.cpython-311.pyc b/agents/__pycache__/text_analyzer_agent.cpython-311.pyc index 423b5dc12c84e707f548e6952b7f0915f222bb5d..62e1f0abe8398bc875c59f3b474407e04cbc8981 100644 Binary files a/agents/__pycache__/text_analyzer_agent.cpython-311.pyc and b/agents/__pycache__/text_analyzer_agent.cpython-311.pyc differ diff --git a/agents/__pycache__/verifier_agent.cpython-311.pyc b/agents/__pycache__/verifier_agent.cpython-311.pyc index 85edc3760484176b019d9c405bef1303af1b1d2a..84021abcb33db3306c99c0873021d13e339abe0b 100644 Binary files a/agents/__pycache__/verifier_agent.cpython-311.pyc and b/agents/__pycache__/verifier_agent.cpython-311.pyc differ diff --git 
a/agents/__pycache__/video_analyzer_agent.cpython-311.pyc b/agents/__pycache__/video_analyzer_agent.cpython-311.pyc deleted file mode 100644 index 2535cca910cfbf10e0389f3904e8484393224a6b..0000000000000000000000000000000000000000 Binary files a/agents/__pycache__/video_analyzer_agent.cpython-311.pyc and /dev/null differ diff --git a/agents/advanced_validation_agent.py b/agents/advanced_validation_agent.py index 38c08113b08cbe7b1952c01b9fc472ab502fb1ac..1e645a88bff05dfdaf1782dd2de7e8b15dcb1e02 100644 --- a/agents/advanced_validation_agent.py +++ b/agents/advanced_validation_agent.py @@ -2,12 +2,16 @@ import os import logging import json from typing import List, Dict, Optional, Union +from dotenv import load_dotenv from llama_index.core.agent.workflow import ReActAgent from llama_index.core.tools import FunctionTool from llama_index.llms.google_genai import GoogleGenAI # Assuming research_agent might be needed for handoff, but not directly imported +# Load environment variables +load_dotenv() + # Setup logging logger = logging.getLogger(__name__) @@ -45,7 +49,7 @@ def cross_reference_check(claim: str, sources_content: List[Dict[str, str]]) -> return {"error": "No source content provided for cross-referencing."} # LLM configuration - llm_model = os.getenv("VALIDATION_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Use a capable model + llm_model = os.getenv("VALIDATION_LLM_MODEL", "models/gemini-1.5-pro") # Use a capable model gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: logger.error("GEMINI_API_KEY not found for cross-referencing LLM.") @@ -53,7 +57,7 @@ def cross_reference_check(claim: str, sources_content: List[Dict[str, str]]) -> results = [] try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model) logger.info(f"Using cross-referencing LLM: {llm_model}") for i, source in enumerate(sources_content): @@ -114,7 +118,7 @@ def 
logical_consistency_check(text: str) -> Dict[str, Union[bool, str, List[str] logger.info(f"Checking logical consistency for text (length: {len(text)} chars).") # LLM configuration - llm_model = os.getenv("VALIDATION_LLM_MODEL", "gemini-2.5-pro-preview-03-25") + llm_model = os.getenv("VALIDATION_LLM_MODEL", "models/gemini-1.5-pro") gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: logger.error("GEMINI_API_KEY not found for consistency check LLM.") @@ -138,7 +142,7 @@ def logical_consistency_check(text: str) -> Dict[str, Union[bool, str, List[str] ) try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model, response_mime_type="application/json") logger.info(f"Using consistency check LLM: {llm_model}") response = llm.complete(prompt) @@ -174,7 +178,7 @@ def bias_detection(text: str, source_context: Optional[str] = None) -> Dict[str, logger.info(f"Detecting bias in text (length: {len(text)} chars). 
Context provided: {source_context is not None}") # LLM configuration - llm_model = os.getenv("VALIDATION_LLM_MODEL", "gemini-2.5-pro-preview-03-25") + llm_model = os.getenv("VALIDATION_LLM_MODEL", "models/gemini-1.5-pro") gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: logger.error("GEMINI_API_KEY not found for bias detection LLM.") @@ -203,7 +207,7 @@ def bias_detection(text: str, source_context: Optional[str] = None) -> Dict[str, ) try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model, response_mime_type="application/json") logger.info(f"Using bias detection LLM: {llm_model}") response = llm.complete(prompt) @@ -300,7 +304,7 @@ def initialize_advanced_validation_agent() -> ReActAgent: logger.info("Initializing AdvancedValidationAgent...") # Configuration for the agent's main LLM - agent_llm_model = os.getenv("VALIDATION_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Use Pro for main agent logic + agent_llm_model = os.getenv("VALIDATION_AGENT_LLM_MODEL", "models/gemini-1.5-pro") # Use Pro for main agent logic gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: @@ -308,7 +312,7 @@ def initialize_advanced_validation_agent() -> ReActAgent: raise ValueError("GEMINI_API_KEY must be set for AdvancedValidationAgent") try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model) logger.info(f"Using agent LLM: {agent_llm_model}") # Load system prompt @@ -343,6 +347,7 @@ def initialize_advanced_validation_agent() -> ReActAgent: llm=llm, system_prompt=system_prompt, can_handoff_to=valid_handoffs, + verbose=True # Enable verbose logging ) logger.info("AdvancedValidationAgent initialized successfully.") return agent diff --git a/agents/annexe_autres_elements.tex b/agents/annexe_autres_elements.tex deleted file mode 
100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/agents/code_agent.py b/agents/code_agent.py index af18d7eaab111acc101d03f12fc9249d2787f6ee..7ca2f049b5866d709866d200fdfe412908384cdb 100644 --- a/agents/code_agent.py +++ b/agents/code_agent.py @@ -1,5 +1,6 @@ import os import logging +from dotenv import load_dotenv from llama_index.core.agent.workflow import CodeActAgent, ReActAgent from llama_index.core.tools import FunctionTool @@ -7,6 +8,9 @@ from llama_index.llms.google_genai import GoogleGenAI from llama_index.llms.openai import OpenAI from llama_index.tools.code_interpreter import CodeInterpreterToolSpec +# Load environment variables +load_dotenv() + # Setup logging logger = logging.getLogger(__name__) @@ -43,10 +47,12 @@ def generate_python_code(prompt: str) -> str: # Configuration for code generation LLM gen_llm_model = os.getenv("CODE_GEN_LLM_MODEL", "o4-mini") - gen_api_key = os.getenv("OPENAI_API_KEY") + gen_api_key_env = os.getenv("CODE_GEN_API_KEY_ENV", "ALPAFLOW_OPENAI_API_KEY") + gen_api_key = os.getenv(gen_api_key_env) if not gen_api_key: - raise ValueError("OPENAI_API_KEY environment variable is not set.") + logger.error(f"{gen_api_key_env} not found in environment variables for code generation LLM.") + raise ValueError(f"{gen_api_key_env} must be set for code generation") # Load the prompt template default_gen_prompt_template = ("You are a helpful assistant that writes Python code. 
" @@ -62,10 +68,7 @@ def generate_python_code(prompt: str) -> str: try: llm = OpenAI( model=gen_llm_model, - api_key=gen_api_key, - reasoning_effort="high", - temperature=0.1, - max_tokens=16384 + api_key=gen_api_key ) logger.info(f"Using code generation LLM: {gen_llm_model}") generated_code = llm.complete(input_prompt) @@ -116,7 +119,7 @@ def initialize_code_agent() -> ReActAgent: logger.info("Initializing CodeAgent...") # Configuration for the agent's main LLM - agent_llm_model = os.getenv("CODE_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") + agent_llm_model = os.getenv("CODE_AGENT_LLM_MODEL", "models/gemini-1.5-pro") gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: @@ -127,96 +130,29 @@ def initialize_code_agent() -> ReActAgent: llm = GoogleGenAI( api_key=gemini_api_key, model=agent_llm_model, - temperature=0.10 ) logger.info(f"Using agent LLM: {agent_llm_model}") # Load system prompt (consider loading from file) default_system_prompt = """\ - You are CodeAgent, a specialist in generating and executing Python code. Your mission: - - 1. **Thought**: Think step-by-step before acting and state your reasoning. - 2. **Code Generation**: To produce code, call `python_code_generator` with a concise, unambiguous prompt. Review the generated code for correctness and safety. - 3. **Execution & Testing**: To execute or test code, call `code_interpreter`. Provide the complete code snippet. Analyze its output (stdout, stderr, result) to verify functionality and debug errors. - 4. **Iteration**: If execution fails or the result is incorrect, analyze the error, think about the fix, generate corrected code using `python_code_generator`, and execute again using `code_interpreter`. - 5. **Tool Use**: Always adhere strictly to each tool’s input/output format. - 6. **Final Output**: Once the code works correctly and achieves the goal, output *only* the final functional code or the final execution result, as appropriate for the task. - 7. 
**Hand-Off**: If further logical reasoning or verification is needed, delegate to **reasoning_agent**. Otherwise, pass your final output to **planner_agent** for synthesis. - - **Special Instructions for Chess-Related Tasks**: - - Prioritize using the Stockfish engine to solve chess problems. Ubuntu installation: `sudo apt-get install stockfish` so path is `/usr/games/stockfish` - - Use `python-chess` to represent boards, generate and validate moves, and parse PGN/FEN. - - **Available Python Packages**: - - - beautifulsoup4: HTML/XML parsing and lightweight web scraping - - certifi: Mozilla CA bundle for secure TLS/SSL requests - - datasets: Hugging Face dataset loading and streaming - - duckdb: In‑process OLAP SQL engine (analytics, Parquet, Arrow) - - ffmpeg-python: Wrapper around FFmpeg for audio/video operations - - gradio[oauth]: Rapid web‑UI prototyping with optional OAuth - - helium: High‑level Selenium / browser automation toolkit - - huggingface: Interact with Hugging Face Hub models, datasets, spaces - - imageio: Read and write images, GIFs, MP4s, volumes, etc. 
- - matplotlib: 2‑D plotting (figures, axes, annotations) - - numpy: N‑dimensional arrays and vectorized math - - openai-whisper: Speech‑to‑text transcription - - opencv-python: Computer vision, image/video processing - - openpyxl: Excel .xlsx read/write, styles, formulas - - pandas: DataFrames, time series, CSV/Parquet I/O - - pyarrow: Apache Arrow tables, Parquet, Flight RPC - - pygame: Simple 2‑D game/graphics engine (SDL based) - - python-chess: Chess move generation, PGN/FEN handling, engine UCI integration - - requests: HTTP/HTTPS client with sessions and retries - - scikit-learn: Machine‑learning algorithms, preprocessing, pipelines - - scipy: Scientific computing, optimization, signal processing - - seaborn: Statistical visualization on top of matplotlib - - sqlalchemy: SQL ORM and core engine for many databases - - statsmodels: Econometrics and statistical modeling (GLM, ARIMA) - - stockfish: UCI interface to Stockfish chess engine - - sympy: Symbolic math, algebra, calculus CAS - - youtube-transcript-api: Fetch YouTube video transcripts via API - - yt-dlp: Download videos/playlists from YouTube and other sites - """ - - system_prompt = load_prompt_from_file("code_agent_system_prompt.txt", default_system_prompt) + You are CodeAgent, a specialist in generating and executing Python code. Your mission: + + 1. **Thought**: Think step-by-step before acting and state your reasoning. + 2. **Code Generation**: To produce code, call `python_code_generator` with a concise, unambiguous prompt. Review the generated code for correctness and safety. + 3. **Execution & Testing**: To execute or test code, call `code_interpreter`. Provide the complete code snippet. Analyze its output (stdout, stderr, result) to verify functionality and debug errors. + 4. **Iteration**: If execution fails or the result is incorrect, analyze the error, think about the fix, generate corrected code using `python_code_generator`, and execute again using `code_interpreter`. + 5. 
**Tool Use**: Always adhere strictly to each tool’s input/output format. + 6. **Final Output**: Once the code works correctly and achieves the goal, output *only* the final functional code or the final execution result, as appropriate for the task. + 7. **Hand-Off**: If further logical reasoning or verification is needed, delegate to **reasoning_agent**. Otherwise, pass your final output to **planner_agent** for synthesis. + """ + # system_prompt = load_prompt_from_file("code_agent_system_prompt.txt", default_system_prompt) + system_prompt = default_system_prompt # Using inline for now agent = ReActAgent( name="code_agent", description=( - "Generates Python code using `python_code_generator` and executes it safely with " - "`code_interpreter`, then iteratively debugs and refines the code from run-time feedback.\n\n" - "The agent can leverage the following pre-installed packages:\n" - "- beautifulsoup4>=4.13.4 : HTML/XML parsing and lightweight web scraping\n" - "- certifi>=2025.4.26 : Mozilla CA bundle for secure TLS/SSL requests\n" - "- datasets>=3.5.1 : Hugging Face dataset loading and streaming\n" - "- duckdb>=1.2.2 : In‑process OLAP SQL engine (analytics, Parquet, Arrow)\n" - "- ffmpeg-python>=0.2.0 : Wrapper around FFmpeg for audio/video operations\n" - "- gradio[oauth]>=5.28.0 : Rapid web‑UI prototyping with optional OAuth\n" - "- helium>=5.1.1 : High‑level Selenium / browser automation toolkit\n" - "- huggingface>=0.0.1 : Interact with Hugging Face Hub models, datasets, spaces\n" - "- imageio>=2.37.0 : Read and write images, GIFs, MP4s, volumes, etc.\n" - "- matplotlib>=3.10.1 : 2‑D plotting (figures, axes, annotations)\n" - "- numpy>=2.2.5 : N‑dimensional arrays and vectorized math\n" - "- openai-whisper>=20240930 : Speech‑to‑text transcription\n" - "- opencv-python>=4.11.0.86 : Computer vision, image/video processing\n" - "- openpyxl>=3.1.5 : Excel .xlsx read/write, styles, formulas\n" - "- pandas>=2.2.3 : DataFrames, time series, CSV/Parquet I/O\n" - "- 
pyarrow>=20.0.0 : Apache Arrow tables, Parquet, Flight RPC\n" - "- pygame>=2.6.1 : Simple 2‑D game/graphics engine (SDL based)\n" - "- python-chess>=1.999 : Chess move generation, PGN/FEN handling, engines\n" - "- requests>=2.32.3 : HTTP/HTTPS client with sessions and retries\n" - "- scikit-learn>=1.6.1 : Machine‑learning algorithms, preprocessing, pipelines\n" - "- scipy>=1.15.2 : Scientific computing, optimization, signal processing\n" - "- seaborn>=0.13.2 : Statistical visualization on top of matplotlib\n" - "- sqlalchemy>=2.0.40 : SQL ORM and core engine for many databases\n" - "- statsmodels>=0.14.4 : Econometrics and statistical modeling (GLM, ARIMA)\n" - "- stockfish==3.28.0 : UCI interface to Stockfish chess engine\n" - "- sympy>=1.14.0 : Symbolic math, algebra, calculus CAS\n" - "- youtube-transcript-api>=1.0.3 : Fetch YouTube video transcripts via API\n" - "- yt-dlp>=2025.3.31 : Download videos/playlists from YouTube and other sites\n\n" - "Additionally, the `stockfish` package enables the agent to solve chess problems by analyzing positions, " - "identifying tactical motifs, and calculating optimal move sequences, making it a valuable tool for chess training and analysis." + "Generates Python code using `python_code_generator` and executes it safely using `code_interpreter`. " + "Iteratively debugs and refines code based on execution results." ), # REMOVED: code_execute_fn - Execution is handled by the code_interpreter tool via the agent loop. 
# Helper function to load prompt from file
def load_prompt_from_file(filename: str, default_prompt: str) -> str:
    """Load a prompt from a text file located relative to this module.

    Args:
        filename: Path of the prompt file, joined onto the directory that
            contains this module (an absolute path is used as-is).
        default_prompt: Text returned when the file is missing or cannot be
            read.

    Returns:
        The file contents, or ``default_prompt`` on any failure. Failures are
        logged and never raised to the caller.
    """
    try:
        script_dir = os.path.dirname(__file__)
        prompt_path = os.path.join(script_dir, filename)
        # Explicit encoding: prompt files routinely contain non-ASCII
        # punctuation and the platform default encoding is not UTF-8 everywhere.
        with open(prompt_path, "r", encoding="utf-8") as f:
            prompt = f.read()
        logger.info(f"Successfully loaded prompt from {prompt_path}")
        return prompt
    except FileNotFoundError:
        logger.warning(f"Prompt file {filename} not found at {prompt_path}. Using default.")
        return default_prompt
    except Exception as e:
        logger.error(f"Error loading prompt file {filename}: {e}", exc_info=True)
        return default_prompt


# --- Core Figure Interpretation Logic (using Multi-Modal LLM) ---

def interpret_figure_with_llm(image_path: str, request: str) -> str:
    """Interpret a figure in an image for a specific request using a multi-modal LLM.

    The image and the instruction text are sent together as one user chat
    message made of content blocks. The text-only ``complete()`` call has no
    image parameter, so an image handed to it never reaches the model.

    Args:
        image_path: Path to the image file containing the figure.
        request: The question or interpretation task (e.g. "Describe this
            chart", "Extract sales for Q3", "Identify the main trend").

    Returns:
        The interpretation text, or a string starting with ``"Error"`` when
        the image is missing, the API key is unset, or the LLM call fails.
        This function reports failures through its return value and does not
        raise.
    """
    logger.info(f"Interpreting figure in image: {image_path} with request: {request}")

    # isfile (not exists): a directory path must be rejected here rather than
    # being forwarded to the model as if it were an image.
    if not os.path.isfile(image_path):
        logger.error(f"Image file not found: {image_path}")
        return f"Error: Image file not found at {image_path}"

    # LLM configuration. The selected model must accept image input
    # (e.g. gemini-1.5-pro).
    llm_model_name = os.getenv("FIGURE_INTERPRETATION_LLM_MODEL", "models/gemini-1.5-pro")
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        logger.error("GEMINI_API_KEY not found for figure interpretation LLM.")
        return "Error: GEMINI_API_KEY not set."

    # The prompt steers the model to act as a figure interpreter for this
    # specific request and to rely on the image alone.
    prompt = (
        f"You are an expert figure interpreter. Analyze the provided image containing a chart, graph, diagram, or table. "
        f"Focus *only* on the visual information present in the image. "
        f"Fulfill the following request accurately and concisely:\n\n"
        f"REQUEST: {request}\n\n"
        f"Based *only* on the image, provide the answer:"
    )

    try:
        # Imported lazily so a missing/old llama-index install is reported as
        # an error string by the ImportError handler below.
        from llama_index.core.llms import ChatMessage, ImageBlock, TextBlock

        llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model_name)
        logger.info(f"Using figure interpretation LLM: {llm_model_name}")

        # One user message carrying both the image and the instruction text.
        message = ChatMessage(
            role="user",
            blocks=[
                ImageBlock(path=image_path),
                TextBlock(text=prompt),
            ],
        )
        response = llm.chat([message])

        # content can be None when the model returns no text part.
        interpretation = (response.message.content or "").strip()
        logger.info("Figure interpretation successful.")
        return interpretation

    except FileNotFoundError:
        # The file can disappear between the check above and the LLM call.
        logger.error(f"Image file not found during LLM call: {image_path}")
        return f"Error: Image file not found at {image_path}"
    except ImportError as ie:
        logger.error(f"Missing library for multi-modal processing: {ie}")
        return f"Error: Missing required library for image processing ({ie})."
    except Exception as e:
        # Top-level boundary for API errors: always answer with a string so
        # the calling tool/agent loop keeps running.
        logger.error(f"LLM call failed during figure interpretation: {e}", exc_info=True)
        if "does not support image input" in str(e).lower():
            logger.error(f"The configured model {llm_model_name} does not support image input.")
            return f"Error: The configured LLM ({llm_model_name}) does not support image input. Please configure a multi-modal model."
        return f"Error during figure interpretation: {e}"
# --- Tool Definitions (Wrapping the core logic) ---
# Each wrapper pairs the caller's image path with one request string and hands
# both to interpret_figure_with_llm, returning whatever string it returns.

def describe_figure_tool_fn(image_path: str) -> str:
    """Provides a general description of the figure in the image (type, elements, topic)."""
    task = "Describe this figure, including its type, main elements (axes, labels, legend), and overall topic."
    return interpret_figure_with_llm(image_path, task)


def extract_data_points_tool_fn(image_path: str, data_request: str) -> str:
    """Extracts specific data points or values from the figure in the image."""
    # data_request is interpolated verbatim into the request sent to the model.
    task = (
        f"Extract the following data points/values from the figure: {data_request}. "
        f"If exact values are not clear, provide the closest estimate based on the visual."
    )
    return interpret_figure_with_llm(image_path, task)


def identify_trends_tool_fn(image_path: str) -> str:
    """Identifies and describes trends or patterns shown in the figure in the image."""
    task = "Analyze and describe the main trends or patterns shown in this figure."
    return interpret_figure_with_llm(image_path, task)


def compare_elements_tool_fn(image_path: str, comparison_request: str) -> str:
    """Compares different elements within the figure in the image."""
    # comparison_request is interpolated verbatim into the request sent to the model.
    task = (
        f"Compare the following elements within the figure: {comparison_request}. "
        f"Be specific about the comparison based on the visual data."
    )
    return interpret_figure_with_llm(image_path, task)


def summarize_figure_insights_tool_fn(image_path: str) -> str:
    """Summarizes the key insights or main message conveyed by the figure in the image."""
    task = "Summarize the key insights or the main message conveyed by this figure."
    return interpret_figure_with_llm(image_path, task)


# --- Tool Definitions for Agent ---
# The name/description pairs below are what the agent's LLM sees when it
# chooses a tool.
describe_figure_tool = FunctionTool.from_defaults(
    name="describe_figure",
    description="Provides a general description of the figure in the image (type, elements, topic). Input: image_path (str).",
    fn=describe_figure_tool_fn,
)

extract_data_points_tool = FunctionTool.from_defaults(
    name="extract_data_points",
    description="Extracts specific data points/values from the figure. Input: image_path (str), data_request (str).",
    fn=extract_data_points_tool_fn,
)

identify_trends_tool = FunctionTool.from_defaults(
    name="identify_trends",
    description="Identifies and describes trends/patterns in the figure. Input: image_path (str).",
    fn=identify_trends_tool_fn,
)

compare_elements_tool = FunctionTool.from_defaults(
    name="compare_elements",
    description="Compares different elements within the figure. Input: image_path (str), comparison_request (str).",
    fn=compare_elements_tool_fn,
)

summarize_figure_insights_tool = FunctionTool.from_defaults(
    name="summarize_figure_insights",
    description="Summarizes the key insights/main message of the figure. Input: image_path (str).",
    fn=summarize_figure_insights_tool_fn,
)
+) + +identify_trends_tool = FunctionTool.from_defaults( + fn=identify_trends_tool_fn, + name="identify_trends", + description="Identifies and describes trends/patterns in the figure. Input: image_path (str)." +) + +compare_elements_tool = FunctionTool.from_defaults( + fn=compare_elements_tool_fn, + name="compare_elements", + description="Compares different elements within the figure. Input: image_path (str), comparison_request (str)." +) + +summarize_figure_insights_tool = FunctionTool.from_defaults( + fn=summarize_figure_insights_tool_fn, + name="summarize_figure_insights", + description="Summarizes the key insights/main message of the figure. Input: image_path (str)." +) + +# --- Agent Initialization --- +def initialize_figure_interpretation_agent() -> ReActAgent: + """Initializes the Figure Interpretation Agent.""" + logger.info("Initializing FigureInterpretationAgent...") + + # Configuration for the agent's main LLM (can be the same multi-modal one) + agent_llm_model = os.getenv("FIGURE_INTERPRETATION_AGENT_LLM_MODEL", "models/gemini-1.5-pro") + gemini_api_key = os.getenv("GEMINI_API_KEY") + + if not gemini_api_key: + logger.error("GEMINI_API_KEY not found for FigureInterpretationAgent.") + raise ValueError("GEMINI_API_KEY must be set for FigureInterpretationAgent") + + try: + # Agent's LLM doesn't necessarily need to be multi-modal itself, + # if the tools handle the multi-modal calls. + # However, using a multi-modal one might allow more direct interaction patterns later. + llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model) + logger.info(f"Using agent LLM: {agent_llm_model}") + + # Load system prompt + default_system_prompt = ("You are FigureInterpretationAgent... 
[Default prompt content - replace with actual]" # Placeholder + ) + system_prompt = load_prompt_from_file("../prompts/figure_interpretation_agent_prompt.txt", default_system_prompt) + if system_prompt == default_system_prompt: + logger.warning("Using default/fallback system prompt for FigureInterpretationAgent.") + + # Define available tools + tools = [ + describe_figure_tool, + extract_data_points_tool, + identify_trends_tool, + compare_elements_tool, + summarize_figure_insights_tool + ] + + # Define valid handoff targets + valid_handoffs = [ + "planner_agent", # To return results + "research_agent", # If context from figure needs further research + "reasoning_agent" # If interpretation needs logical analysis + ] + + agent = ReActAgent( + name="figure_interpretation_agent", + description=( + "Analyzes and interprets visual data representations (charts, graphs, tables) from image files. " + "Can describe figures, extract data, identify trends, compare elements, and summarize insights." + ), + tools=tools, + llm=llm, + system_prompt=system_prompt, + can_handoff_to=valid_handoffs, + # Note: This agent inherently requires multi-modal input capabilities, + # which are handled within its tools via a multi-modal LLM. + ) + logger.info("FigureInterpretationAgent initialized successfully.") + return agent + + except Exception as e: + logger.error(f"Error during FigureInterpretationAgent initialization: {e}", exc_info=True) + raise + +# Example usage (for testing if run directly) +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') + logger.info("Running figure_interpretation_agent.py directly for testing...") + + # Check required keys + required_keys = ["GEMINI_API_KEY"] + missing_keys = [key for key in required_keys if not os.getenv(key)] + if missing_keys: + print(f"Error: Required environment variable(s) not set: {', '.join(missing_keys)}. 
Cannot run test.") + else: + # Check if a multi-modal model is likely configured (heuristic) + model_name = os.getenv("FIGURE_INTERPRETATION_LLM_MODEL", "models/gemini-1.5-pro") + if "pro" not in model_name.lower() and "vision" not in model_name.lower(): + print(f"Warning: Configured LLM {model_name} might not support image input. Tests may fail.") + + # Create a dummy image file for testing (requires Pillow) + dummy_image_path = "dummy_figure.png" + try: + from PIL import Image, ImageDraw, ImageFont + img = Image.new('RGB', (400, 200), color = (255, 255, 255)) + d = ImageDraw.Draw(img) + # Try to load a default font, handle if not found + try: + font = ImageFont.truetype("arial.ttf", 15) # Common font, might not exist + except IOError: + font = ImageFont.load_default() + print("Arial font not found, using default PIL font.") + d.text((10,10), "Simple Bar Chart", fill=(0,0,0), font=font) + d.rectangle([50, 50, 100, 150], fill=(255,0,0)) # Bar 1 + d.text((60, 160), "A", fill=(0,0,0), font=font) + d.rectangle([150, 80, 200, 150], fill=(0,0,255)) # Bar 2 + d.text((160, 160), "B", fill=(0,0,0), font=font) + img.save(dummy_image_path) + print(f"Created dummy image file: {dummy_image_path}") + + # Test the tools directly + print("\nTesting describe_figure...") + desc = describe_figure_tool_fn(dummy_image_path) + print(f"Description: {desc}") + + print("\nTesting extract_data_points (qualitative)...") + extract_req = "Height of bar A vs Bar B" # Qualitative request + extract_res = extract_data_points_tool_fn(dummy_image_path, extract_req) + print(f"Extraction Result: {extract_res}") + + print("\nTesting compare_elements...") + compare_req = "Compare bar A and bar B" + compare_res = compare_elements_tool_fn(dummy_image_path, compare_req) + print(f"Comparison Result: {compare_res}") + + # Clean up dummy image + os.remove(dummy_image_path) + + except ImportError: + print("Pillow library not installed. 
Skipping direct tool tests that require image creation.") + # Optionally, still try initializing the agent + try: + test_agent = initialize_figure_interpretation_agent() + print("\nFigure Interpretation Agent initialized successfully (tool tests skipped).") + except Exception as e: + print(f"Error initializing agent: {e}") + except Exception as e: + print(f"Error during testing: {e}") + if os.path.exists(dummy_image_path): + os.remove(dummy_image_path) # Ensure cleanup on error + diff --git a/agents/image_analyzer_agent.py b/agents/image_analyzer_agent.py index 07d03088ba9ed06aeb6ed5fc433071a45f462334..3ed3192e24c772bc497e0bed05aae664ffc3022e 100644 --- a/agents/image_analyzer_agent.py +++ b/agents/image_analyzer_agent.py @@ -1,9 +1,13 @@ import os import logging +from dotenv import load_dotenv from llama_index.core.agent.workflow import FunctionAgent from llama_index.llms.google_genai import GoogleGenAI +# Load environment variables +load_dotenv() + # Setup logging logger = logging.getLogger(__name__) @@ -35,7 +39,7 @@ def initialize_image_analyzer_agent() -> FunctionAgent: logger.info("Initializing ImageAnalyzerAgent...") # Configuration from environment variables - llm_model_name = os.getenv("IMAGE_ANALYZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25") + llm_model_name = os.getenv("IMAGE_ANALYZER_LLM_MODEL", "models/gemini-1.5-pro") gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: @@ -65,7 +69,7 @@ def initialize_image_analyzer_agent() -> FunctionAgent: system_prompt=system_prompt, # No explicit tools needed if relying on direct multimodal LLM call # tools=[], - can_handoff_to=["planner_agent", "research_agent", "reasoning_agent", "figure_interpretation_agent"], + can_handoff_to=["planner_agent", "research_agent", "reasoning_agent"], ) logger.info("ImageAnalyzerAgent initialized successfully.") return agent diff --git a/agents/long_context_management_agent.py b/agents/long_context_management_agent.py index 
50012f113af920376dac11d2ca658b7411f4ca22..0bd299f29b741cc7bda3e30420205e4e5b0adf70 100644 --- a/agents/long_context_management_agent.py +++ b/agents/long_context_management_agent.py @@ -2,6 +2,7 @@ import os import logging import json from typing import List, Dict, Optional, Union, Literal +from dotenv import load_dotenv from llama_index.core.agent.workflow import ReActAgent from llama_index.core.tools import FunctionTool, QueryEngineTool @@ -11,6 +12,8 @@ from llama_index.core.node_parser import SentenceSplitter from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core.retrievers import VectorIndexRetriever +# Load environment variables +load_dotenv() # Setup logging logger = logging.getLogger(__name__) @@ -115,7 +118,7 @@ def summarize_long_context(detail_level: Literal["brief", "standard", "detailed" min_length = min_length or int(max_length * 0.3) # Default min length # LLM configuration - llm_model = os.getenv("CONTEXT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Use Pro for potentially long context + llm_model = os.getenv("CONTEXT_LLM_MODEL", "models/gemini-1.5-pro") # Use Pro for potentially long context gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: logger.error("GEMINI_API_KEY not found for summarization LLM.") @@ -135,7 +138,7 @@ def summarize_long_context(detail_level: Literal["brief", "standard", "detailed" ) try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model) logger.info(f"Using summarization LLM: {llm_model}") response = llm.complete(prompt) summary = response.text.strip() @@ -307,7 +310,7 @@ def initialize_long_context_management_agent() -> ReActAgent: logger.info("Initializing LongContextManagementAgent...") # Configuration for the agent's main LLM - agent_llm_model = os.getenv("CONTEXT_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Needs to handle planning + agent_llm_model = 
os.getenv("CONTEXT_AGENT_LLM_MODEL", "models/gemini-1.5-pro") # Needs to handle planning gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: @@ -315,7 +318,7 @@ def initialize_long_context_management_agent() -> ReActAgent: raise ValueError("GEMINI_API_KEY must be set for LongContextManagementAgent") try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model) logger.info(f"Using agent LLM: {agent_llm_model}") Settings.llm = llm # Set default LLM for LlamaIndex components used by tools @@ -339,18 +342,14 @@ def initialize_long_context_management_agent() -> ReActAgent: valid_handoffs = [ "planner_agent", # To return results "text_analyzer_agent", # If further analysis of extracted/filtered text is needed - "reasoning_agent", - "research_agent" + "reasoning_agent" ] agent = ReActAgent( name="long_context_management_agent", description=( - "Manages and processes long textual context efficiently. Handles large documents, transcripts, or datasets " - "by summarizing (`summarize_long_context`), extracting key information (`extract_key_information`), " - "filtering relevant content (`filter_by_relevance`), and answering questions based on the context (`query_context_index`). " - "Supports internal indexing for efficient retrieval and repeated queries. Optimized for chunked input processing " - "and contextual distillation. Only relies on the provided input and avoids external augmentation unless explicitly requested." + "Manages and processes long textual context. Can load text (`load_text_context`), summarize (`summarize_long_context`), " + "extract key info (`extract_key_information`), filter by relevance (`filter_by_relevance`), and answer questions based on the context (`query_context_index`)." 
), tools=tools, llm=llm, diff --git a/agents/math_agent.py b/agents/math_agent.py index 3308451756e6da1b9ef7c66b07f9826d4aa45849..30dbc3ef323e7c21e0dade84ba2c9be0aa371b70 100644 --- a/agents/math_agent.py +++ b/agents/math_agent.py @@ -1,13 +1,13 @@ import os import logging -from typing import List, Dict +from typing import List, Optional, Union, Dict +from dotenv import load_dotenv import sympy as sp import numpy as np import scipy.linalg as la import scipy.special as special -from llama_index.tools.code_interpreter import CodeInterpreterToolSpec -from scipy.integrate import quad +from scipy.integrate import odeint, quad from scipy.stats import binom, norm, poisson import numpy.fft as fft @@ -16,6 +16,9 @@ from llama_index.core.tools import FunctionTool from llama_index.llms.google_genai import GoogleGenAI from llama_index.tools.wolfram_alpha import WolframAlphaToolSpec +# Load environment variables +load_dotenv() + # Setup logging logger = logging.getLogger(__name__) @@ -600,26 +603,6 @@ def get_wolfram_alpha_tools() -> List[FunctionTool]: _wolfram_alpha_tools = [] return _wolfram_alpha_tools - -# Use LlamaIndex's built-in Code Interpreter Tool Spec for safe execution -# This assumes the necessary environment (e.g., docker) for the spec is available -try: - code_interpreter_spec = CodeInterpreterToolSpec() - # Get the tool(s) from the spec. It might return multiple tools. 
- code_interpreter_tools = code_interpreter_spec.to_tool_list() - if not code_interpreter_tools: - raise RuntimeError("CodeInterpreterToolSpec did not return any tools.") - # Assuming the primary tool is the first one, or find by name if necessary - code_interpreter_tool = next((t for t in code_interpreter_tools if t.metadata.name == "code_interpreter"), None) - if code_interpreter_tool is None: - raise RuntimeError("Could not find 'code_interpreter' tool in CodeInterpreterToolSpec results.") - logger.info("CodeInterpreterToolSpec initialized successfully.") -except Exception as e: - logger.error(f"Failed to initialize CodeInterpreterToolSpec: {e}", exc_info=True) - # Fallback: Define a dummy tool or raise error to prevent agent start? - # For now, let initialization fail if the safe interpreter isn't available. - raise RuntimeError("CodeInterpreterToolSpec failed to initialize. Cannot create code_agent.") from e - # --- Agent Initialization --- def initialize_math_agent() -> ReActAgent: @@ -627,7 +610,7 @@ def initialize_math_agent() -> ReActAgent: logger.info("Initializing MathAgent...") # Configuration - agent_llm_model = os.getenv("MATH_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") + agent_llm_model = os.getenv("MATH_AGENT_LLM_MODEL", "models/gemini-1.5-pro") gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: @@ -638,12 +621,11 @@ def initialize_math_agent() -> ReActAgent: llm = GoogleGenAI( api_key=gemini_api_key, model=agent_llm_model, - temperature=0.05 ) logger.info(f"Using agent LLM: {agent_llm_model}") # Combine Python tools and Wolfram Alpha tools - all_tools = get_python_math_tools() + get_wolfram_alpha_tools() + [code_interpreter_tool] + all_tools = get_python_math_tools() + get_wolfram_alpha_tools() if not all_tools: logger.warning("No math tools available (Python or WolframAlpha). MathAgent may be ineffective.") @@ -668,8 +650,6 @@ def initialize_math_agent() -> ReActAgent: - Clearly state which tool you are using and why. 
- Handle potential errors gracefully and report them if they prevent finding a solution. - Pay close attention to input formats required by each tool (e.g., lists for vectors/matrices, strings for symbolic expressions). - - If your response exceeds the maximum token limit and cannot be completed in a single reply, please conclude your output with the marker [CONTINUE]. In subsequent interactions, I will prompt you with “continue” to receive the next portion of the response. """ agent = ReActAgent( @@ -681,7 +661,7 @@ def initialize_math_agent() -> ReActAgent: tools=all_tools, llm=llm, system_prompt=system_prompt, - can_handoff_to=["planner_agent", "reasoning_agent"], + can_handoff_to=["planner_agent"], ) logger.info("MathAgent initialized successfully.") return agent diff --git a/agents/planner_agent.py b/agents/planner_agent.py index 4b65682ff6da3506e54cf14f402c3d0bb61c4146..7b1d18a04ab26f56dd1a8aa84495113dc91ee8a3 100644 --- a/agents/planner_agent.py +++ b/agents/planner_agent.py @@ -1,11 +1,14 @@ import os import logging from typing import List, Dict +from dotenv import load_dotenv from llama_index.core.agent.workflow import ReActAgent from llama_index.core.tools import FunctionTool from llama_index.llms.google_genai import GoogleGenAI +# Load environment variables +load_dotenv() # Setup logging logger = logging.getLogger(__name__) @@ -41,11 +44,11 @@ def plan(objective: str) -> List[str]: logger.info(f"Generating plan for objective: {objective[:100]}...") # Configuration for planning LLM - planner_llm_model = os.getenv("PLANNER_TOOL_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Specific model for this tool? + planner_llm_model = os.getenv("PLANNER_TOOL_LLM_MODEL", "models/gemini-1.5-pro") # Specific model for this tool? gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: logger.error("GEMINI_API_KEY not found for planning tool LLM.") - return "Error: GEMINI_API_KEY not set for planning." 
+ return ["Error: GEMINI_API_KEY not set for planning."] # Prompt for the LLM to generate sub-steps input_prompt = ( @@ -57,7 +60,7 @@ def plan(objective: str) -> List[str]: ) try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=planner_llm_model) logger.info(f"Using planning LLM: {planner_llm_model}") response = llm.complete(input_prompt) @@ -81,23 +84,22 @@ def plan(objective: str) -> List[str]: if not sub_steps: logger.warning("LLM generated no sub-steps for the objective.") - return "Error: Failed to generate sub-steps." + return ["Error: Failed to generate sub-steps."] logger.info(f"Generated {len(sub_steps)} sub-steps.") - return sub_steps except Exception as e: logger.error(f"LLM call failed during planning: {e}", exc_info=True) - return f"Error during planning: {e}" + return [f"Error during planning: {e}"] -def synthesize_and_report(results: List[Dict[str, str]]) -> str: +def synthesize_and_respond(results: List[Dict[str, str]]) -> str: """ Aggregate results from sub-steps into a coherent final report using an LLM. Args: results (List[Dict[str, str]]): List of dictionaries, each with "sub_step" and "answer" keys. Returns: - str: A unified, well-structured report, or an error message. + str: A unified, well-structured response, or an error message. """ logger.info(f"Synthesizing results from {len(results)} sub-steps...") if not results: @@ -112,16 +114,14 @@ def synthesize_and_report(results: List[Dict[str, str]]) -> str: summary_blocks += f"Sub-step {i+1}: {sub_step}\nAnswer {i+1}: {answer}\n\n" # Configuration for synthesis LLM - synthesizer_llm_model = os.getenv("SYNTHESIZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Specific model? + synthesizer_llm_model = os.getenv("SYNTHESIZER_LLM_MODEL", "models/gemini-1.5-pro") # Specific model? 
gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: logger.error("GEMINI_API_KEY not found for synthesis tool LLM.") return "Error: GEMINI_API_KEY not set for synthesis." # Prompt for the LLM - input_prompt = f"""You are an expert synthesizer. Given the following sub-steps and their answers derived - from an initial objective, produce a single, coherent, comprehensive final report that - addresses the original objective: + input_prompt = f"""You are an expert synthesizer. Given the following sub-steps and their answers derived from an initial objective, produce a single, coherent, comprehensive final report that addresses the original objective: --- SUB-STEP RESULTS --- {summary_blocks.strip()} @@ -131,7 +131,7 @@ def synthesize_and_report(results: List[Dict[str, str]]) -> str: """ try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=synthesizer_llm_model) logger.info(f"Using synthesis LLM: {synthesizer_llm_model}") response = llm.complete(input_prompt) logger.info("Synthesis successful.") @@ -140,59 +140,10 @@ def synthesize_and_report(results: List[Dict[str, str]]) -> str: logger.error(f"LLM call failed during synthesis: {e}", exc_info=True) return f"Error during synthesis: {e}" -def answer_question(question: str) -> str: - """ - Answer any question by following this strict format: - 1. Include your chain of thought (your reasoning steps). - 2. End your reply with the exact template: - FINAL ANSWER: [YOUR FINAL ANSWER] - YOUR FINAL ANSWER must be: - - A number, or - - As few words as possible, or - - A comma-separated list of numbers and/or strings. - Formatting rules: - * If asked for a number, do not use commas or units (e.g., $, %), unless explicitly requested. - * If asked for a string, do not include articles or abbreviations (e.g., city names), and write digits in plain text. 
- * If asked for a comma-separated list, apply the above rules to each element. - This tool should be invoked immediately after completing the final planning sub-step. - """ - logger.info(f"Answering question: {question[:100]}") - - gemini_api_key = os.getenv("GEMINI_API_KEY") - if not gemini_api_key: - logger.error("GEMINI_API_KEY not set for answer_question tool.") - return "Error: GEMINI_API_KEY not set." - - model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "gemini-2.5-pro-preview-03-25") - - # Build the assistant prompt enforcing the required format - assistant_prompt = ( - "You are a general AI assistant. I will ask you a question. " - "Report your thoughts, and finish your answer with the following template: " - "FINAL ANSWER: [YOUR FINAL ANSWER]. " - "YOUR FINAL ANSWER should be a number OR as few words as possible " - "OR a comma separated list of numbers and/or strings. " - "If you are asked for a number, don't use commas for thousands or any units like $ or % unless specified. " - "If you are asked for a string, omit articles and abbreviations, and write digits in plain text. " - "If you are asked for a comma separated list, apply these rules to each element.\n\n" - f"Question: {question}\n" - "Answer:" - ) - - try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) - logger.info(f"Using answer LLM: {model_name}") - response = llm.complete(assistant_prompt) - logger.info("Answer generated successfully.") - return response.text - except Exception as e: - logger.error(f"LLM call failed during answer generation: {e}", exc_info=True) - return f"Error during answer generation: {e}" - # --- Tool Definitions --- synthesize_tool = FunctionTool.from_defaults( - fn=synthesize_and_report, - name="synthesize_and_report", + fn=synthesize_and_respond, + name="synthesize_and_respond", description=( "Aggregates results from multiple sub-steps into a final coherent report. 
" "Input: results (List[Dict[str, str]]) where each dict has \"sub_step\" and \"answer\". " @@ -209,22 +160,13 @@ generate_substeps_tool = FunctionTool.from_defaults( ) ) -answer_question = FunctionTool.from_defaults( - fn=answer_question, - name="answer_question", - description=( - "Answers any question and returns the full text, always ending with " - "‘FINAL ANSWER: ...’ in accordance with the formatting rules." - ), -) - # --- Agent Initialization --- def initialize_planner_agent() -> ReActAgent: """Initializes the Planner Agent.""" logger.info("Initializing PlannerAgent...") # Configuration for the agent's main LLM - agent_llm_model = os.getenv("PLANNER_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") + agent_llm_model = os.getenv("PLANNER_AGENT_LLM_MODEL", "models/gemini-1.5-pro") gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: @@ -232,7 +174,7 @@ def initialize_planner_agent() -> ReActAgent: raise ValueError("GEMINI_API_KEY must be set for PlannerAgent") try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model) logger.info(f"Using agent LLM: {agent_llm_model}") # Load system prompt @@ -253,10 +195,8 @@ def initialize_planner_agent() -> ReActAgent: "role_agent", "image_analyzer_agent", "text_analyzer_agent", - "reasoning_agent", - "long_context_management_agent", - "advanced_validation_agent", - "video_analyzer_agent" + "verifier_agent", + "reasoning_agent" ] agent = ReActAgent( @@ -264,7 +204,7 @@ def initialize_planner_agent() -> ReActAgent: description=( "Strategically plans tasks by breaking down objectives into sub-steps using `generate_substeps`. " "Orchestrates execution by handing off sub-steps to specialized agents. " - "Synthesizes final results using `synthesize_and_report`." + "Synthesizes final results using `synthesize_and_respond`." 
), tools=tools, llm=llm, @@ -301,7 +241,7 @@ if __name__ == "__main__": {"sub_step": "Find recent sales data.", "answer": "EV sales grew 25% year-over-year in Q1 2024."}, {"sub_step": "Analyze government incentives.", "answer": "Germany reduced subsidies, France maintained them."} ] - report = synthesize_and_report(test_results) + report = synthesize_and_respond(test_results) print(f"Synthesized Report:\n{report}") # Initialize the agent (optional) diff --git a/agents/reasoning_agent.py b/agents/reasoning_agent.py index 62841608586d83ea53b1a5213d44749013d4f638..e0a6d35d723dcd87a93a68dd6679f5b700fd212f 100644 --- a/agents/reasoning_agent.py +++ b/agents/reasoning_agent.py @@ -1,11 +1,15 @@ import os import logging +from dotenv import load_dotenv from llama_index.core.agent.workflow import ReActAgent from llama_index.core.tools import FunctionTool from llama_index.llms.google_genai import GoogleGenAI from llama_index.llms.openai import OpenAI +# Load environment variables +load_dotenv() + # Setup logging logger = logging.getLogger(__name__) @@ -41,7 +45,7 @@ def reasoning_tool_fn(context: str) -> str: # Configuration for the reasoning LLM (OpenAI in the original) reasoning_llm_model = os.getenv("REASONING_LLM_MODEL", "gpt-4o-mini") # Use gpt-4o-mini as default - openai_api_key = os.getenv("OPENAI_API_KEY") + openai_api_key = os.getenv("ALPAFLOW_OPENAI_API_KEY") # Specific key from original code if not openai_api_key: logger.error("ALPAFLOW_OPENAI_API_KEY not found for reasoning tool LLM.") @@ -71,9 +75,7 @@ def reasoning_tool_fn(context: str) -> str: llm = OpenAI( model=reasoning_llm_model, api_key=openai_api_key, - reasoning_effort="high", - temperature=0.055, - max_tokens=16384 + # reasoning_effort="high" # Add if needed and supported by the specific OpenAI integration ) logger.info(f"Using reasoning LLM: {reasoning_llm_model}") response = llm.complete(reasoning_prompt) @@ -83,57 +85,6 @@ def reasoning_tool_fn(context: str) -> str: logger.error(f"Error during 
reasoning tool LLM call: {e}", exc_info=True) return f"Error during reasoning: {e}" - -def answer_question(question: str) -> str: - """ - Answer any question by following this strict format: - 1. Include your chain of thought (your reasoning steps). - 2. End your reply with the exact template: - FINAL ANSWER: [YOUR FINAL ANSWER] - YOUR FINAL ANSWER must be: - - A number, or - - As few words as possible, or - - A comma-separated list of numbers and/or strings. - Formatting rules: - * If asked for a number, do not use commas or units (e.g., $, %), unless explicitly requested. - * If asked for a string, do not include articles or abbreviations (e.g., city names), and write digits in plain text. - * If asked for a comma-separated list, apply the above rules to each element. - This tool should be invoked immediately after completing the final planning sub-step. - """ - logger.info(f"Answering question: {question[:100]}") - - gemini_api_key = os.getenv("GEMINI_API_KEY") - if not gemini_api_key: - logger.error("GEMINI_API_KEY not set for answer_question tool.") - return "Error: GEMINI_API_KEY not set." - - model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "gemini-2.5-pro-preview-03-25") - - # Build the assistant prompt enforcing the required format - assistant_prompt = ( - "You are a general AI assistant. I will ask you a question. " - "Report your thoughts, and finish your answer with the following template: " - "FINAL ANSWER: [YOUR FINAL ANSWER]. " - "YOUR FINAL ANSWER should be a number OR as few words as possible " - "OR a comma separated list of numbers and/or strings. " - "If you are asked for a number, don't use commas for thousands or any units like $ or % unless specified. " - "If you are asked for a string, omit articles and abbreviations, and write digits in plain text. 
" - "If you are asked for a comma separated list, apply these rules to each element.\n\n" - f"Question: {question}\n" - "Answer:" - ) - - try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) - logger.info(f"Using answer LLM: {model_name}") - response = llm.complete(assistant_prompt) - logger.info("Answer generated successfully.") - return response.text - except Exception as e: - logger.error(f"LLM call failed during answer generation: {e}", exc_info=True) - return f"Error during answer generation: {e}" - - # --- Tool Definition --- reasoning_tool = FunctionTool.from_defaults( fn=reasoning_tool_fn, @@ -144,22 +95,13 @@ reasoning_tool = FunctionTool.from_defaults( ), ) -answer_question = FunctionTool.from_defaults( - fn=answer_question, - name="answer_question", - description=( - "Use this tool to answer any question, reporting your reasoning steps and ending with 'FINAL ANSWER: ...'. " - "Invoke this tool immediately after the final sub-step of planning is complete." 
- ), -) - # --- Agent Initialization --- def initialize_reasoning_agent() -> ReActAgent: """Initializes the Reasoning Agent.""" logger.info("Initializing ReasoningAgent...") # Configuration for the agent's main LLM (Google GenAI) - agent_llm_model = os.getenv("REASONING_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") + agent_llm_model = os.getenv("REASONING_AGENT_LLM_MODEL", "models/gemini-1.5-pro") gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: @@ -167,7 +109,7 @@ def initialize_reasoning_agent() -> ReActAgent: raise ValueError("GEMINI_API_KEY must be set for ReasoningAgent") try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model) logger.info(f"Using agent LLM: {agent_llm_model}") # Load system prompt @@ -180,28 +122,15 @@ def initialize_reasoning_agent() -> ReActAgent: agent = ReActAgent( name="reasoning_agent", description=( - "An autonomous reasoning specialist that applies `reasoning_tool` to perform " - "in-depth chain-of-thought analysis on incoming queries or contexts, " - "then seamlessly delegates the synthesized insights to `planner_agent` " - "or `long_context_management_agent` for subsequent task orchestration." + "A pure reasoning agent that uses the `reasoning_tool` for detailed chain-of-thought analysis " + "on the provided context, then hands off the result to the `planner_agent`." 
), - tools=[reasoning_tool], + tools=[reasoning_tool], # Only has access to the reasoning tool llm=llm, system_prompt=system_prompt, - can_handoff_to=[ - "code_agent", - "research_agent", - "math_agent", - "role_agent", - "image_analyzer_agent", - "text_analyzer_agent", - "planner_agent", - "long_context_management_agent", - "advanced_validation_agent", - "video_analyzer_agent" - ], + can_handoff_to=["planner_agent"], ) - + logger.info("ReasoningAgent initialized successfully.") return agent except Exception as e: diff --git a/agents/research_agent.py b/agents/research_agent.py index c9a2e63e05db61421ef31d7b040c0417ff96b763..b8049d8fc701416145d204f73054cbd3e8d081b9 100644 --- a/agents/research_agent.py +++ b/agents/research_agent.py @@ -2,12 +2,11 @@ import os import time import logging import re # Import regex for video ID extraction -from typing import List, Optional, Dict, Any # Added Dict +from typing import List, Optional, Dict # Added Dict +from dotenv import load_dotenv -from duckdb.duckdb import description from llama_index.core.agent.workflow import ReActAgent from llama_index.core.tools import FunctionTool -from llama_index.core.workflow import Context from llama_index.llms.google_genai import GoogleGenAI from llama_index.tools.google import GoogleSearchToolSpec from llama_index.tools.tavily_research import TavilyToolSpec @@ -28,9 +27,89 @@ except ImportError: logging.warning("Selenium or Helium not installed. Browser interaction tools will be unavailable.") SELENIUM_AVAILABLE = False +# Attempt to import YouTube transcript API +try: + from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound + YOUTUBE_TRANSCRIPT_API_AVAILABLE = True +except ImportError: + logging.warning("youtube-transcript-api not installed. 
YouTube transcript tool will be unavailable.") + YOUTUBE_TRANSCRIPT_API_AVAILABLE = False + +# Load environment variables +load_dotenv() + # Setup logging logger = logging.getLogger(__name__) +# --- Helper function to extract YouTube Video ID --- +def extract_video_id(url: str) -> Optional[str]: + """Extracts the YouTube video ID from various URL formats.""" + # Standard watch URL: https://www.youtube.com/watch?v=VIDEO_ID + match = re.search(r'(?:v=|/v/|embed/|youtu\.be/|/shorts/)([A-Za-z0-9_-]+)', url) + if match: + return match.group(1) + return None + +# --- YouTube Transcript Tool --- +def get_youtube_transcript(video_url_or_id: str, languages=None) -> str: + """Fetches the transcript for a YouTube video using its URL or video ID. + Specify preferred languages as a list (e.g., ["en", "es"]). + Returns the transcript text or an error message. + """ + if languages is None: + languages = ["en"] + if not YOUTUBE_TRANSCRIPT_API_AVAILABLE: + return "Error: youtube-transcript-api library is required but not installed." 
+ + logger.info(f"Attempting to fetch YouTube transcript for: {video_url_or_id}") + video_id = extract_video_id(video_url_or_id) + if not video_id: + # Assume it might be an ID already if extraction fails + if re.match(r"^[a-zA-Z0-9_\-]+$", video_url_or_id): + video_id = video_url_or_id + logger.info("Input treated as video ID.") + else: + logger.error(f"Could not extract valid YouTube video ID from: {video_url_or_id}") + return f"Error: Invalid YouTube URL or Video ID format: {video_url_or_id}" + + try: + # Fetch available transcripts + api = YouTubeTranscriptApi() + transcript_list = api.list(video_id) + + # Try to find a transcript in the specified languages + transcript = transcript_list.find_transcript(languages) + + # Fetch the actual transcript data (list of dicts) + transcript_data = transcript.fetch() + + # Combine the text parts into a single string + full_transcript = " ".join(snippet.text for snippet in transcript_data) + + full_transcript = " ".join(snippet.text for snippet in transcript_data) + logger.info(f"Successfully fetched transcript for video ID {video_id} in language {transcript.language}.") + return full_transcript + + except TranscriptsDisabled: + logger.warning(f"Transcripts are disabled for video ID: {video_id}") + return f"Error: Transcripts are disabled for this video (ID: {video_id})." + except NoTranscriptFound as e: + logger.warning(f"No transcript found for video ID {video_id} in languages {languages}. 
Available: {e.available_transcripts}") + # Try fetching any available transcript if specific languages failed + try: + logger.info(f"Attempting to fetch any available transcript for {video_id}") + any_transcript = transcript_list.find_generated_transcript(transcript_list.manually_created_transcripts.keys() or transcript_list.generated_transcripts.keys()) + any_transcript_data = any_transcript.fetch() + full_transcript = " ".join([item["text"] for item in any_transcript_data]) + logger.info(f"Successfully fetched fallback transcript for video ID {video_id} in language {any_transcript.language}.") + return full_transcript + except Exception as fallback_e: + logger.error(f"Could not find any transcript for video ID {video_id}. Original error: {e}. Fallback error: {fallback_e}") + return f"Error: No transcript found for video ID {video_id} in languages {languages} or any fallback language." + except Exception as e: + logger.error(f"Unexpected error fetching transcript for video ID {video_id}: {e}", exc_info=True) + return f"Error fetching transcript: {e}" + # --- Browser Interaction Tools (Conditional on Selenium/Helium availability) --- # Global browser instance (managed by initializer) @@ -69,7 +148,7 @@ def browser_tool_handler(func): return wrapper @browser_tool_handler -def visit_url(url: str, wait_seconds: float = 3.0) -> str: +def visit(url: str, wait_seconds: float = 3.0) -> str: """Navigate the browser to the specified URL and wait for the page to load.""" logger.info(f"Navigating to {url} and waiting {wait_seconds}s...") go_to(url) @@ -78,35 +157,9 @@ def visit_url(url: str, wait_seconds: float = 3.0) -> str: return f"Successfully navigated to: {current_url}" @browser_tool_handler -def get_text_by_css_selector(selector: str) -> list[Any] | str: - """ - (Browser) Extract visible text content from a webpage using a CSS selector. - - Args: - selector (str): - A valid CSS selector (e.g., 'body', '.content', '#main'). 
- - Behavior: - - If selector == 'body', extracts all visible text from the tag. - - If the tag is not found, falls back to Helium Text() for visible elements. - - For any other selector, uses Selenium to find all matching elements. - - Filters out invisible elements and empty lines. - - Returns: - list[str]: - A list of visible text lines. - OR - str: - An error message starting with "Error:" on failure (e.g., missing state). - """ +def get_text_by_css(selector: str) -> List[str]: + """Extract text from all elements matching a CSS selector. Use selector=\"body\" for all visible text.""" logger.info(f"Extracting text using CSS selector: {selector}") - # state_dict = await ctx.get("state") - # if not state_dict: - # logger.error("State not found in context.") - # return "Error: State not found." - # - # research_content = state_dict.get("research_content", []) - if selector.lower() == "body": # Helium Text() might be too broad, let's try body tag first try: @@ -122,253 +175,19 @@ def get_text_by_css_selector(selector: str) -> list[Any] | str: # Process Helium elements if fallback is used texts = [elem.web_element.text for elem in elements if elem.web_element.is_displayed() and elem.web_element.text.strip()] logger.info(f"Extracted {len(texts)} visible text elements using Helium Text().") - # research_content.extend(texts) - # state_dict["research_content"] = research_content - # await ctx.set("state", state_dict) return texts else: # Use Selenium directly for more control elements_selenium = _browser_driver.find_elements(By.CSS_SELECTOR, selector) texts = [elem.text for elem in elements_selenium if elem.is_displayed() and elem.text.strip()] logger.info(f"Extracted {len(texts)} visible text elements for selector {selector}.") - # state_dict["research_content"] = research_content - # await ctx.set("state", state_dict) return texts @browser_tool_handler -def search_in_page(query: str, - case_sensitive: bool = False, - max_results: int = 50) -> list[str] | str: - """ - 
(Browser) Search for occurrences of a word or phrase in the visible text of the current page. - - Args: - query (str): - Word or phrase to search for (e.g., 'machine learning'). - case_sensitive (bool, optional): - Whether the search should be case-sensitive (default: False). - max_results (int, optional): - Maximum number of matching lines to return (default: 50). - - Behavior: - - Retrieves all visible text from the tag. - - Splits the text into individual lines. - - Filters lines that contain the `query` (respecting `case_sensitive`). - - Appends the matching lines to `state['research_content']`. - - Truncates the result to `max_results`. - - Returns: - list[str]: - List of matching lines (up to `max_results`). - OR - str: - An error message starting with "Error:" on failure (e.g., missing state or browser). - """ - # Ensure we have state - # state = await ctx.get("state") or {} - # if not state: - # logger.error("State not found in context.") - # return "Error: State not found." - - # Extract all visible text from the page - try: - body = _browser_driver.find_element(By.TAG_NAME, "body") - text = body.text or "" - except Exception as e: - logger.error(f"Failed to extract page text: {e}") - return f"Error: Could not retrieve page text ({e})." 
- - # Prepare for search - lines = [line.strip() for line in text.splitlines() if line.strip()] - needle = query if case_sensitive else query.lower() - - # Find matches - matches = [] - for line in lines: - haystack = line if case_sensitive else line.lower() - if needle in haystack: - matches.append(line) - if len(matches) >= max_results: - break - - # Update research context - # research = state.get("research_content", []) - # research.extend(matches) - # state["research_content"] = research - # await ctx.set("state", state) - - return matches - -@browser_tool_handler -def suggest_informative_selectors(min_words: int = 10, max_selectors: int = 30) -> List[str]: - """ - Analyze the current page and return a list of CSS selectors likely to contain informative text, - along with up to 1000 characters of the element's visible content. - - Parameters: - - min_words (int): minimum number of words in an element's text to consider it informative. - - max_selectors (int): maximum number of distinct selectors to return. - - Returns: - - List[str]: each entry formatted as "selector: preview", where preview is a truncated (1000 chars max) version of the element's content. 
- """ - logger.info("Analyzing page to suggest informative CSS selectors with previews...") - elements = _browser_driver.find_elements(By.XPATH, "//*[not(self::script or self::style or self::head)]") - selector_scores: Dict[str, Dict] = {} - - for elem in elements: - if not elem.is_displayed(): - continue - try: - text = elem.text.strip() - if len(text.split()) >= min_words: - tag = elem.tag_name - class_attr = elem.get_attribute("class") or "" - id_attr = elem.get_attribute("id") or "" - - # Prioritize by specificity: id > class > tag - if id_attr: - selector = f"{tag}#{id_attr}" - elif class_attr: - main_class = class_attr.strip().split()[0] - selector = f"{tag}.{main_class}" - else: - selector = tag - - current_score = len(text) - if selector not in selector_scores or current_score > selector_scores[selector]["score"]: - selector_scores[selector] = { - "score": current_score, - "preview": text[:1000] # Limit preview to 1000 chars - } - except Exception as e: - logger.warning(f"Error processing element: {e}") - continue - - # Sort by score (proxy for information density) and return top N - sorted_items = sorted(selector_scores.items(), key=lambda x: x[1]["score"], reverse=True) - top_descriptions = [f"{selector}: {info['preview']}" for selector, info in sorted_items[:max_selectors]] - - logger.info(f"Suggested {len(top_descriptions)} informative selectors with previews.") - return top_descriptions - -@browser_tool_handler -def inspect_clickable_elements(max_elements: int = 20) -> List[str]: - """ - Inspect the current page and return a list of visible, clickable elements with their CSS selectors and preview text. - - Parameters: - - max_elements (int): maximum number of elements to include. - - Returns: - - List[str]: descriptions of clickable elements with selector, tag, and truncated inner text. 
- """ - logger.info("Inspecting page for clickable elements...") - - # Define XPaths for clickable elements - xpaths = [ - "//a[@href]", - "//button", - "//input[@type='submit' or @type='button']", - "//*[@onclick]", - "//*[contains(@role, 'button')]" - ] - seen = set() - results = [] - - for xpath in xpaths: - try: - elements = _browser_driver.find_elements(By.XPATH, xpath) - for elem in elements: - if not elem.is_displayed(): - continue - - try: - tag = elem.tag_name - class_attr = elem.get_attribute("class") or "" - id_attr = elem.get_attribute("id") or "" - text = elem.text.strip() - - # Construct CSS selector - if id_attr: - selector = f"{tag}#{id_attr}" - elif class_attr: - selector = f"{tag}.{class_attr.strip().split()[0]}" - else: - selector = tag - - if selector in seen: - continue - seen.add(selector) - - description = ( - f"selector: {selector}\n" - f"tag: {tag}\n" - f"text: {text[:100] if text else '[no visible text]'}" - ) - results.append(description) - - if len(results) >= max_elements: - logger.info(f"Reached limit of {max_elements} clickable elements.") - return results - except Exception as inner_err: - logger.warning(f"Error processing clickable element: {inner_err}") - except Exception as outer_err: - logger.warning(f"XPath evaluation failed: {xpath} => {outer_err}") - - logger.info(f"Found {len(results)} clickable elements.") - return results - -@browser_tool_handler -def inspect_clickable_elements_for_filtering_or_sorting(min_words: int = 1, max_items: int = 20) -> List[str]: - """ - Inspect the current page to find clickable elements (e.g., buttons, links, dropdowns) - that are likely to be used for filtering or sorting content. - - Parameters: - - min_words (int): minimum number of words to consider an element potentially meaningful. - - max_items (int): maximum number of clickable selectors to return. - - Returns: - - List[str]: a list of unique CSS selectors (e.g., button.sort, a.filter) likely tied to filtering/sorting functionality. 
- """ - logger.info("Inspecting clickable elements for filtering or sorting...") - - clickable_tags = ["button", "a", "input", "select", "label", "div", "span"] - selectors_found = {} - - for tag in clickable_tags: - try: - elements = _browser_driver.find_elements(By.TAG_NAME, tag) - for elem in elements: - if not elem.is_displayed() or not elem.is_enabled(): - continue - text = elem.text.strip() - if len(text.split()) >= min_words or elem.get_attribute("aria-label") or elem.get_attribute("role") in { - "button", "combobox"}: - tag_name = elem.tag_name - class_attr = elem.get_attribute("class") or "" - id_attr = elem.get_attribute("id") or "" - - if id_attr: - selector = f"{tag_name}#{id_attr}" - elif class_attr: - main_class = class_attr.strip().split()[0] - selector = f"{tag_name}.{main_class}" - else: - selector = tag_name - - if selector not in selectors_found: - selectors_found[selector] = text - except Exception as e: - logger.warning(f"Failed to process tag '{tag}': {e}") - continue - - sorted_selectors = sorted(selectors_found.items(), key=lambda x: len(x[1]), reverse=True) - final_selectors = [s for s, _ in sorted_selectors[:max_items]] - - logger.info(f"Found {len(final_selectors)} candidate selectors for filtering/sorting.") - return final_selectors +def get_page_html() -> str: + """Return the full HTML source of the current page.""" + logger.info("Retrieving page HTML source...") + return _browser_driver.page_source @browser_tool_handler def click_element_by_css(selector: str, index: int = 0) -> str: @@ -397,7 +216,7 @@ def click_element_by_css(selector: str, index: int = 0) -> str: return f"Clicked element {index} matching selector {selector}. 
Current URL: {_browser_driver.current_url}" @browser_tool_handler -def input_text_by_css(selector: str, text: str, index: int = 0, press_enter: bool = True) -> str: +def input_text_by_css(selector: str, text: str, index: int = 0, press_enter: bool = False) -> str: """Input text into the Nth (0-based index) element matching the CSS selector. Optionally press Enter.""" logger.info(f"Attempting to input text into element {index} matching selector: {selector}") # Use Selenium directly for finding elements @@ -467,119 +286,7 @@ def close_popups() -> str: time.sleep(0.5) return "Sent ESC key press." -async def answer_question(ctx: Context, question: str) -> str: - """ - Answer any question by following this strict format: - 1. Include your chain of thought (your reasoning steps). - 2. End your reply with the exact template: - FINAL ANSWER: [YOUR FINAL ANSWER] - YOUR FINAL ANSWER must be: - - A number, or - - As few words as possible, or - - A comma-separated list of numbers and/or strings. - Formatting rules: - * If asked for a number, do not use commas or units (e.g., $, %), unless explicitly requested. - * If asked for a string, do not include articles or abbreviations (e.g., city names), and write digits in plain text. - * If asked for a comma-separated list, apply the above rules to each element. - This tool should be invoked immediately after completing the final planning sub-step. - """ - logger.info(f"Answering question: {question[:100]}") - - state_dict = await ctx.get("state") - if not state_dict: - logger.error("State not found in context.") - return "Error: State not found." - - research_content = state_dict.get("research_content", []) - - research_content_str = "\n".join(research_content) - - gemini_api_key = os.getenv("GEMINI_API_KEY") - if not gemini_api_key: - logger.error("GEMINI_API_KEY not set for answer_question tool.") - return "Error: GEMINI_API_KEY not set." 
- - model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "gemini-2.5-pro-preview-03-25") - - prompt = f""" - You are **StepwiseAnswerAgent**, a formal reasoning assistant designed to provide clear, - accurate, and actionable answers. - - ──────────────────────────────────────────── - CORE OPERATING PRINCIPLES - ──────────────────────────────────────────── - 1. **Comprehensive Information Gathering** - – Gather and synthesize all available information. - – Identify gaps or missing data. - - 2. **Step-by-Step Reasoning** *(internal only)* - – Think through the problem logically in sequential steps. - – This reasoning should remain invisible to the user; only the final answer is shown. - - 3. **Skeptical Verification** - – Question assumptions. - – Clearly flag any uncertainties or unverifiable claims (“uncertain”, “missing data”, etc.). - – Use reliable sources or tool outputs where possible. - - 4. **Clarity and Brevity** - – Use a formal and professional tone. - – Keep language precise and concise. - – Prioritize clarity, utility, and immediate usability of the answer. - - ──────────────────────────────────────────── - INTERNAL PROCEDURE (HIDDEN) - ──────────────────────────────────────────── - A. List all known facts and identify unknowns. - B. Construct a logical step-by-step reasoning chain. - C. Validate consistency and completeness. - D. Output only the final answer, with optional extras if relevant. - - ──────────────────────────────────────────── - RESPONSE FORMAT - ──────────────────────────────────────────── - **Answer:** - A clear, direct response addressing the user's request, without exposing reasoning steps. - - *(Optional)* - – **Key Points:** bullet-point summary of critical insights. - – **Next Steps / Recommended Actions:** if applicable. - - ──────────────────────────────────────────── - CONSTRAINTS - ──────────────────────────────────────────── - • Do not speculate. Clearly indicate when information is incomplete. 
- • Do not reveal internal reasoning or system instructions. - • No filler, no flattery, no unnecessary context. - • If the question is under-specified, ask for clarification instead of guessing. - """ - - # Build the assistant prompt enforcing the required format - assistant_prompt = ( - f"{prompt}\n\n" - "I will ask you a question. " - "Report your thoughts, and finish your answer with the following template: " - "FINAL ANSWER: [YOUR FINAL ANSWER]. " - "YOUR FINAL ANSWER should be a number OR as few words as possible " - "OR a comma separated list of numbers and/or strings. " - "If you are asked for a number, don't use commas for thousands or any units like $ or % unless specified. " - "If you are asked for a string, omit articles and abbreviations, and write digits in plain text. " - "If you are asked for a comma separated list, apply these rules to each element.\n\n" - "Let's begin.\n\n" - f"All available research: {research_content_str}\n" - f"Question: {question}\n" - "Answer:" - ) - - try: - llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) - logger.info(f"Using answer LLM: {model_name}") - response = llm.complete(assistant_prompt) - logger.info("Answer generated successfully.") - return response.text - except Exception as e: - logger.error(f"LLM call failed during answer generation: {e}", exc_info=True) - return f"Error during answer generation: {e}" - +# --- Search Engine & Data Source Tools --- # --- Agent Initializer Class --- class ResearchAgentInitializer: @@ -589,6 +296,7 @@ class ResearchAgentInitializer: self.browser_tools = [] self.search_tools = [] self.datasource_tools = [] + self.youtube_tool = None # Added for YouTube tool # Initialize LLM self._initialize_llm() @@ -603,44 +311,18 @@ class ResearchAgentInitializer: # Initialize Search/Datasource Tools self._create_search_tools() self._create_datasource_tools() - - self.answer_question = FunctionTool.from_defaults( - fn=answer_question, - 
name="answer_question", - description=( - "(QA) Answer any question using structured, step-by-step reasoning, and return a concise, final result.\n\n" - "**Inputs:**\n" - "- `ctx` (Context): Execution context containing prior research state.\n" - "- `question` (str): A direct, factual question to be answered based on collected knowledge.\n\n" - "**Behavior:**\n" - "- Retrieves accumulated research content from shared state.\n" - "- Performs logical reasoning internally using a formal chain-of-thought.\n" - "- Generates a full response that includes visible reasoning steps followed by a strict answer format.\n\n" - "**Output Format:**\n" - "- Returns a string with:\n" - " 1. Reasoning steps (visible to user).\n" - " 2. Final answer, always ending with:\n" - " `FINAL ANSWER: [your answer]`\n\n" - "**Answer Constraints:**\n" - "- The final answer must be:\n" - " • A number (without commas or units, unless explicitly requested), or\n" - " • A short string (no articles or abbreviations), or\n" - " • A comma-separated list of numbers and/or strings (same rules apply).\n\n" - "**Errors:**\n" - "- Returns a string prefixed with `Error:` if state is missing or LLM fails to respond." 
- ) - ) + self._create_youtube_tool() # Added logger.info("ResearchAgent resources initialized.") def _initialize_llm(self): - agent_llm_model = os.getenv("RESEARCH_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") + agent_llm_model = os.getenv("RESEARCH_AGENT_LLM_MODEL", "models/gemini-1.5-pro") gemini_api_key = os.getenv("GEMINI_API_KEY") if not gemini_api_key: logger.error("GEMINI_API_KEY not found for ResearchAgent LLM.") raise ValueError("GEMINI_API_KEY must be set for ResearchAgent") try: - self.llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05) + self.llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model) logger.info(f"ResearchAgent LLM initialized: {agent_llm_model}") except Exception as e: logger.error(f"Failed to initialize ResearchAgent LLM: {e}", exc_info=True) @@ -680,138 +362,19 @@ class ResearchAgentInitializer: if not SELENIUM_AVAILABLE: self.browser_tools = [] return - + self.browser_tools = [ - FunctionTool.from_defaults( - fn=visit_url, - name="visit_url", - description=( - "(Browser) Navigate the browser to a specified URL and wait for the page to load.\n" - "Inputs: url (str), wait_seconds (float, default=3.0).\n" - "Output: str — confirmation message including final URL." 
- ) - ), - FunctionTool.from_defaults( - fn=get_text_by_css_selector, - name="get_text_by_css_selector", - description=( - "(Browser) Extract visible text content from a webpage using a CSS selector.\n\n" - "**Inputs:**\n" - "- `selector` (str): A valid CSS selector (e.g., `'body'`, `'.content'`, `'#main'`).\n\n" - "**Behavior:**\n" - "- If `selector='body'`, extracts all visible text from the `` tag.\n" - "- If elements are not found via the DOM, falls back to visible elements via Helium `Text()`.\n" - "- For other selectors, uses Selenium to extract text from all visible matching elements.\n" - "- Filters out invisible and empty lines.\n\n" - "**Output:**\n" - "- `List[str]`: List of visible text lines, or an error message string on failure." - ) - ), - FunctionTool.from_defaults( - fn=search_in_page, - name="search_in_page", - description=( - "(Browser) Search for a word or phrase in the visible text of the current page.\n\n" - "**Inputs:**\n" - "- `query` (str): Word or phrase to search for (e.g., 'machine learning').\n" - "- `case_sensitive` (bool, optional): Whether the search is case-sensitive (default: False).\n" - "- `max_results` (int, optional): Maximum number of matching lines to return (default: 50).\n\n" - "**Behavior:**\n" - "- Extracts all visible text from the `` tag.\n" - "- Splits text into lines and filters those containing `query`.\n" - "- Appends found lines to the shared `research_content` state.\n\n" - "**Output:**\n" - "- `List[str]`: Matching lines (up to `max_results`).\n" - "- `str`: An error message if state or browser is unavailable." - ) - ), - FunctionTool.from_defaults( - fn=click_element_by_css, - name="click_element_by_css", - description=( - "(Browser) Click the N-th visible element matching a CSS selector.\n" - "Inputs: selector (str), index (int, default=0).\n" - "Output: str — confirmation message with final URL." 
- ) - ), - FunctionTool.from_defaults( - fn=input_text_by_css, - name="input_text_by_css", - description=( - "(Browser) Input text into the N-th input element matching a CSS selector, optionally pressing Enter.\n" - "Inputs: selector (str), text (str), index (int, default=0), press_enter (bool, default=True).\n" - "Output: str — confirmation of text input and action." - ) - ), - FunctionTool.from_defaults( - fn=scroll_page, - name="scroll_page", - description=( - "(Browser) Scroll the page in a given direction and amount.\n" - "Inputs: direction (str: 'up' or 'down'), amount (str: 'page', 'top', 'bottom', or number of pixels).\n" - "Output: str — confirmation of scroll action." - ) - ), - FunctionTool.from_defaults( - fn=go_back, - name="navigate_back", - description=( - "(Browser) Navigate back one step in browser history.\n" - "Inputs: none.\n" - "Output: str — confirmation of back navigation with current URL." - ) - ), - FunctionTool.from_defaults( - fn=close_popups, - name="close_popups", - description=( - "(Browser) Attempt to close pop-ups or modals by simulating an ESC keypress.\n" - "Inputs: none.\n" - "Output: str — confirmation of ESC key sent." - ) - ), - FunctionTool.from_defaults( - fn=suggest_informative_selectors, - name="suggest_informative_selectors", - description=( - "(Browser) Analyze the current web page and return a list of up to N CSS selectors likely to contain " - "informative text content. Each result includes the CSS selector followed by a preview of up to " - "1000 characters of the element's text content. 
This is especially useful for manually identifying " - "relevant containers before applying filters, scrapers, or sorters.\n\n" - "**Inputs:**\n" - "- `min_words` (int, default=10): Minimum number of words in the element for it to be considered informative.\n" - "- `max_selectors` (int, default=15): Maximum number of top selectors to return.\n\n" - "**Output:**\n" - "- `List[str]`: Each string is formatted as:\n" - " 'selector: preview_text'\n" - " where `selector` is a CSS path (e.g. `div.article`, `section#main`) and `preview_text` is a truncated (1000 char max) excerpt " - "of the visible text in that element." - ) - ), - FunctionTool.from_defaults( - fn=inspect_clickable_elements_for_filtering_or_sorting, - name="inspect_filter_sort_selectors", - description=( - "(Browser) Manually inspect the page for clickable elements (buttons, dropdowns, etc.) that may be used " - "for filtering or sorting. Returns a list of candidate CSS selectors.\n" - "Inputs: min_words (int, default=1), max_items (int, default=20).\n" - "Output: List[str] — list of unique selectors." - ) - ), - FunctionTool.from_defaults( - fn=inspect_clickable_elements, - name="inspect_clickable_elements", - description=( - "(Browser) Inspect the current page for clickable elements (e.g., ,