| | from dataclasses import dataclass |
| | from typing import List, Dict, Any, Optional |
| | import json |
| | import requests |
| | from bs4 import BeautifulSoup |
| | from openai import OpenAI |
| |
|
| | """ |
| | EXAMPLE OUTPUT: |
| | |
| | What is the current population for the city where Einstein was born? |
| | |
| | Step 1 |
| | ---------------------------------------- |
| | |
| | Executing: fetch_wiki_content |
| | Arguments: {'title': 'Albert Einstein'} |
| | |
| | Step 2 |
| | ---------------------------------------- |
| | |
| | Executing: deliver_answer |
| | Arguments: {'fields': ['Ulm, German Empire']} |
| | ANSWER FROM THE ASSISTANT: ['Ulm, German Empire'] |
| | |
| | Step 3 |
| | ---------------------------------------- |
| | |
| | Executing: fetch_wiki_content |
| | Arguments: {'title': 'Ulm'} |
| | |
| | Step 4 |
| | ---------------------------------------- |
| | |
| | Executing: deliver_answer |
| | Arguments: {'fields': ['128,928']} |
| | ANSWER FROM THE ASSISTANT: ['128,928'] |
| | |
| | Step 5 |
| | ---------------------------------------- |
| | Extraction Complete |
| | |
| | |
| | Why was Einstein famous? |
| | |
| | Step 1 |
| | ---------------------------------------- |
| | |
| | Executing: fetch_wiki_content |
| | Arguments: {'title': 'Albert Einstein'} |
| | |
| | Step 2 |
| | ---------------------------------------- |
| | |
| | Executing: deliver_answer |
| | Arguments: {'fields': ['Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics.', 'His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world\'s most famous equation."', 'He received the 1921 Nobel Prize in Physics.']} |
| | ANSWER FROM THE ASSISTANT: ['Best known for developing the theory of relativity, Einstein also made important contributions to quantum mechanics.', 'His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world\'s most famous equation."', 'He received the 1921 Nobel Prize in Physics.'] |
| | |
| | Step 3 |
| | ---------------------------------------- |
| | Extraction Complete |
| | """ |
| |
|
@dataclass
class WikiConfig:
    """Configuration for OpenAI and Wikipedia settings"""
    # Placeholder credential — presumably a local/proxy OpenAI-compatible server
    # that does not validate keys. TODO(review): confirm before production use.
    api_key: str = "sk-123"
    # NOTE(review): "{info}" looks like an unexpanded template placeholder from the
    # original write-up — replace with the real server base URL (e.g. "http://host:port/v1").
    api_base: str = "{info}/v1"
    # When None, the agent picks the first model advertised by the server (see
    # WikiExtractionAgent.__init__).
    model: Optional[str] = None
    # Maximum number of tool-call rounds per query before the loop gives up.
    max_steps: int = 5
    # Prefix that article titles are appended to when fetching pages.
    wikipedia_base_url: str = "https://en.wikipedia.org/wiki/"
| |
|
class WikiTools:
    """Collection of Wikipedia fetching and answer-delivery tools."""

    def __init__(self, base_url: str):
        # e.g. "https://en.wikipedia.org/wiki/"; article titles are appended to it.
        self.base_url = base_url

    def fetch_wiki_content(self, title: str, section: Optional[str] = None,
                           timeout: float = 10.0, max_chars: int = 8000) -> str:
        """Fetch and clean Wikipedia article content, optionally from a specific section.

        Args:
            title: Wikipedia article title; spaces are converted to underscores.
            section: Optional heading anchor id (<span id=...>) to extract instead
                of the whole page.
            timeout: Seconds to wait for the HTTP response.
            max_chars: Cap on the length of the returned text (keeps prompts small).

        Returns:
            Whitespace-normalized article text truncated to ``max_chars``, or an
            error string when the section/content cannot be located.
        """
        url = f"{self.base_url}{title.replace(' ', '_')}"
        # BUGFIX: the original call had no timeout, so a stalled server could hang
        # the whole agent loop indefinitely.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Strip non-content markup before extracting text.
        for unwanted in soup.find_all(['script', 'style', 'footer', 'header']):
            unwanted.decompose()

        if section:
            # Section headings carry a <span id="..."> anchor; the body is the
            # run of siblings following the heading element.
            section_tag = soup.find('span', {'id': section})
            if section_tag is None:
                return "Section not found"
            content = section_tag.parent.find_next_siblings()
            text = ' '.join(tag.get_text() for tag in content)
        else:
            content = soup.find(id='mw-content-text')
            if content is None:
                return "Content not found"
            text = content.get_text()

        # Collapse all whitespace runs, then cap the size.
        return ' '.join(text.split())[:max_chars]

    @staticmethod
    def deliver_answer(fields: List[str]) -> Dict[str, Any]:
        """Print the extracted answer spans and return an acknowledgement for the model."""
        print(f"ANSWER FROM THE ASSISTANT: {fields}")
        return {
            "extracted_fields": "Provided fields was delivered to the user successfully."
        }
| |
|
class ToolRegistry:
    """Registry of available tools and their OpenAI function-calling schemas."""

    def __init__(self, wiki_tools: "WikiTools"):
        # String annotation: avoids evaluating the WikiTools name at def time.
        self.wiki_tools = wiki_tools

    @property
    def available_functions(self) -> Dict[str, Any]:
        """Map tool names (as exposed to the model) to the callables that implement them."""
        return {
            "fetch_wiki_content": self.wiki_tools.fetch_wiki_content,
            "deliver_answer": self.wiki_tools.deliver_answer,
        }

    @property
    def tool_schemas(self) -> List[Dict[str, Any]]:
        """JSON-schema tool definitions passed to the chat-completions API."""
        return [
            {
                "type": "function",
                "function": {
                    "name": "fetch_wiki_content",
                    "description": "Fetch content from a Wikipedia article",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "title": {
                                "type": "string",
                                "description": "The title of the Wikipedia article"
                            },
                            # BUGFIX: removed non-standard `"optional": True` — JSON
                            # Schema has no such keyword; optionality is expressed by
                            # leaving "section" out of "required" below.
                            "section": {
                                "type": "string",
                                "description": "Optional: Specific section ID to fetch"
                            }
                        },
                        "required": ["title"]
                    }
                }
            },
            {
                "type": "function",
                "function": {
                    "name": "deliver_answer",
                    "description": "Extract specific information from the fetched text",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "fields": {
                                "type": "array",
                                "items": {"type": "string"},
                                "description": "List of text spans from the article that are relevant to the query"
                            }
                        },
                        "required": ["fields"]
                    }
                }
            }
        ]
| |
|
class WikiExtractionAgent:
    """Main agent class that drives the multi-step fetch/extract loop."""

    # System prompt steering the model toward tool use instead of parametric answers.
    SYSTEM_PROMPT = "1. First fetch any wikipedia pages you might need to answer the user query. Do not answer from parametric knowledge.\n\n2.Then, provide the answer to the user using the deliver_answer from the retrieved wikipedia page.\n\n3. You may need to issue multiple calls to wikipedia after extracting answers if there are nested dependencies for information."

    def __init__(self, config: "WikiConfig"):
        self.config = config
        self.client = OpenAI(api_key=config.api_key, base_url=config.api_base)
        self.wiki_tools = WikiTools(config.wikipedia_base_url)
        self.tools = ToolRegistry(self.wiki_tools)
        # BUGFIX: the original seeded history with {"system": "..."} — not a valid
        # chat message (must be {"role": ..., "content": ...}) — and it was then
        # discarded by extract_information anyway.
        self.messages = [{"role": "system", "content": self.SYSTEM_PROMPT}]

        if not config.model:
            # Auto-select the first model the server advertises.
            models = self.client.models.list()
            self.config.model = models.data[0].id

    def _serialize_tool_call(self, tool_call) -> Dict[str, Any]:
        """Convert an SDK tool-call object into a plain JSON-serializable dict."""
        return {
            "id": tool_call.id,
            "type": tool_call.type,
            "function": {
                "name": tool_call.function.name,
                "arguments": tool_call.function.arguments,
            },
        }

    def process_tool_calls(self, message) -> List[Dict[str, Any]]:
        """Execute every tool call on an assistant message.

        Appends one "tool" role message per call to the conversation history and
        returns a list of {tool, args, response} records for the caller.
        """
        results = []

        for tool_call in message.tool_calls:
            function_name = tool_call.function.name
            # Arguments arrive as a JSON string from the model.
            function_args = json.loads(tool_call.function.arguments)

            print(f"\nExecuting: {function_name}")
            print(f"Arguments: {function_args}")

            function_response = self.tools.available_functions[function_name](**function_args)
            results.append({
                "tool": function_name,
                "args": function_args,
                "response": function_response,
            })

            # Feed the tool result back so the model can plan the next step.
            self.messages.append({
                "role": "tool",
                "content": json.dumps(function_response),
                "tool_call_id": tool_call.id,
                "name": function_name,
            })

        return results

    def extract_information(self, query: str) -> List[Dict[str, Any]]:
        """Run the fetch/answer loop for *query*, up to config.max_steps rounds."""
        # BUGFIX: keep the system prompt at the head of the conversation — the
        # original reset replaced the whole history with just the user message.
        self.messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"""Extract information from Wikipedia to answer this query: {query}

You can use these tools:
1. fetch_wiki_content: Get article content
2. deliver_answer: deliver relevant information

Please fetch content first, and iterate as needed to get to the webpage with the correct answer and then deliver the relevant information.""",
            },
        ]

        all_results = []

        for step in range(self.config.max_steps):
            print(f"\nStep {step + 1}")
            print("-" * 40)

            response = self.client.chat.completions.create(
                messages=self.messages,
                model=self.config.model,
                tools=self.tools.tool_schemas,
                temperature=0.0,
            )

            message = response.choices[0].message

            if not message.tool_calls:
                # Plain reply with no tool calls: the model considers itself done.
                print("Extraction Complete")
                break

            # BUGFIX: the original stored json.dumps(message.content), which turns
            # None into the literal string "null" and double-quotes real content;
            # pass the content through unchanged.
            self.messages.append({
                "role": "assistant",
                "content": message.content,
                "tool_calls": [self._serialize_tool_call(tc) for tc in message.tool_calls],
            })

            results = self.process_tool_calls(message)
            all_results.extend(results)

        return all_results
| |
|
def main():
    """Demo driver: run two example extraction queries through the agent."""
    agent = WikiExtractionAgent(WikiConfig())

    # Nested-dependency query: needs Einstein's page, then the birth city's page.
    agent.extract_information(
        query="""What is the current population for the city where Einstein was born?"""
    )

    # Single-hop query: answerable from one page fetch.
    agent.extract_information(query="Why was Einstein famous?")
| |
|
| |
|
# Script entry point: only run the demo when executed directly, not on import.
if __name__ == "__main__":
    main()
| |
|