Nam Fam committed on
Commit 472e1d4 · 1 Parent(s): 0d52457
.dockerignore ADDED
@@ -0,0 +1,28 @@
+ # Exclude Python cache & virtual envs
+ __pycache__/
+ *.py[cod]
+ *.pyo
+ *.pyd
+ .env
+
+ # Exclude Git files
+ .git/
+ .gitignore
+
+ # IDE/editor files
+ .vscode/
+ .idea/
+
+ # OS files
+ .DS_Store
+ Thumbs.db
+
+ # Project-specific
+ notebooks/
+ scripts/
+ tests/
+ eval/
+ mcp_server/
+ mcp_client.py
+ utils/google_api_manager.py
+ output/
.gitignore ADDED
@@ -0,0 +1,67 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *.egg-info/
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ venv/
+ ENV/
+ env/
+ env.bak/
+ venv.bak/
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # IDEs
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *.sublime-workspace
+ *.sublime-project
+
+ # Environment files
+ .env
+ .env.local
+ .env.development
+ .env.test
+ .env.production
+
+ # Local development
+ # *.db
+ *.sqlite
+ data/
+ logs/
+ *.log
+ # *.csv
+ *.parquet
+ # output/
+
+ # Streamlit
+ .streamlit/credentials.toml
+
+ # /notebooks
+ # /tests
+
+
+ *conftest.py
+ *test_plotting.py
+ plots/
+ utils/google_api_manager.py
+ mcp_client.py
+ mcp_server/
+ tests/
+ scripts/
+ notebooks/
+ output/
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ # syntax=docker/dockerfile:1
+
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt ./
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Expose Streamlit default port
+ EXPOSE 8501
+
+ # # Streamlit configuration
+ # ENV STREAMLIT_SERVER_PORT=8501 \
+ #     STREAMLIT_SERVER_ADDRESS=0.0.0.0
+
+ # # Launch the app
+ # CMD ["streamlit", "run", "app.py"]
+
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
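
With this Dockerfile, a typical local build-and-run flow looks like the following (the talktodata image tag is illustrative; --env-file supplies the Gemini API key at runtime, since the .dockerignore above deliberately keeps .env out of the image):

docker build -t talktodata .
docker run -p 8501:8501 --env-file .env talktodata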
agents/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .sql_agent.agent import SQLAgent
+ from .sql_agent.states import SQLAgentState
+ from .sql_agent.nodes import get_db_info, generate_sql, execute_sql, optional_plot, format_response, generate_answer
+ from .tools import PlotSQLTool
+ from .llms import LLM
+
+ __all__ = ['SQLAgent', 'SQLAgentState', 'get_db_info', 'generate_sql', 'execute_sql', 'optional_plot', 'format_response', 'generate_answer', 'PlotSQLTool', 'LLM']
agents/dataframe_agent.py ADDED
@@ -0,0 +1,39 @@
+ import pandas as pd
+ from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
+ from agents.llms import LLM
+ from langchain.agents.agent_types import AgentType
+
+
+ def get_dataframe_agent(
+     df: pd.DataFrame,
+     verbose: bool = True,
+     allow_dangerous_code: bool = True,
+     agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION
+ ):
+     """
+     Create a pandas DataFrame agent using the custom LLM.
+     Args:
+         df (pd.DataFrame): The pandas DataFrame to use.
+         verbose (bool): Whether to enable verbose output. Default is True.
+         allow_dangerous_code (bool): Whether to allow dangerous code execution. Default is True.
+         agent_type: The agent type to use. Default is ZERO_SHOT_REACT_DESCRIPTION.
+     Returns:
+         agent: The created DataFrame agent.
+     """
+     llm = LLM().chat_model
+     agent = create_pandas_dataframe_agent(
+         llm,
+         df,
+         agent_type=agent_type,
+         verbose=verbose,
+         allow_dangerous_code=allow_dangerous_code
+     )
+     return agent
+
+ # Usage example:
+ # import pandas as pd
+ # from agents.dataframe_agent import get_dataframe_agent
+ # df = pd.read_csv('your_file.csv')
+ # agent = get_dataframe_agent(df)
+ # response = agent.invoke('Your question here')
+ # print(response)
agents/llms.py ADDED
@@ -0,0 +1,54 @@
+ from langchain.chat_models import init_chat_model
+ from langchain_core.messages import HumanMessage
+ from dotenv import load_dotenv
+ from typing import List
+ from langchain.tools import BaseTool
+ from langchain.agents import initialize_agent, AgentType
+
+ _ = load_dotenv()
+
+ class LLM:
+     def __init__(
+         self,
+         model: str = "gemini-2.0-flash",
+         model_provider: str = "google_genai",
+         temperature: float = 0.0,
+         max_tokens: int = 1000
+     ):
+         self.chat_model = init_chat_model(
+             model=model,
+             model_provider=model_provider,
+             temperature=temperature,
+             max_tokens=max_tokens,
+         )
+
+     def generate(self, prompt: str) -> str:
+         message = HumanMessage(content=prompt)
+         response = self.chat_model.invoke([message])
+         return response.content
+
+     def bind_tools(self, tools: List[BaseTool], agent_type: AgentType = AgentType.ZERO_SHOT_REACT_DESCRIPTION):
+         """
+         Bind LangChain tools to this model and return an AgentExecutor.
+         """
+         return initialize_agent(
+             tools,
+             self.chat_model,
+             agent=agent_type,
+             verbose=False
+         )
+
+     def set_temperature(self, temperature: float):
+         """
+         Set the temperature for the chat model.
+         """
+         self.chat_model.temperature = temperature
+
+     def set_max_tokens(self, max_tokens: int):
+         """
+         Set the maximum number of tokens for the chat model.
+         """
+         self.chat_model.max_tokens = max_tokens
+
+
+
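
A minimal sketch of using the LLM wrapper on its own, assuming a valid Google GenAI API key is available via .env or the environment:

from agents.llms import LLM

llm = LLM()  # defaults: gemini-2.0-flash via google_genai, temperature 0.0
print(llm.generate("Say hello in one short sentence."))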
agents/memories.py ADDED
File without changes
agents/safe_guardrails.py ADDED
@@ -0,0 +1,195 @@
+ from typing import Dict
+ from guardrails.validators import Validator, register_validator
+ import sys
+ import os
+ import logging
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+ from agents.llms import LLM
+
+ def setup_logger(name):
+     logger = logging.getLogger(name)
+     logger.setLevel(logging.INFO)
+     handler = logging.StreamHandler()
+     handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+     logger.addHandler(handler)
+     return logger
+
+
+ # @register_validator(name="medical_topic", data_type="string")
+ # class MedicalTopicValidator(Validator):
+ #     """Validates medical topics using project's LLM"""
+
+ #     def __init__(self, threshold: float = 0.7, on_fail=None):
+ #         super().__init__(on_fail=on_fail)
+ #         self.threshold = threshold
+ #         self.llm = LLM()
+ #         self.logger = setup_logger('MedicalValidator')  # Use project's logging
+
+ #     def validate(self, value: str, metadata: Dict = {}) -> str:
+ #         prompt = f"""Medical relevance analysis (0-1 score then YES/NO):
+ #         Question: {value}
+ #         Medical context: {metadata.get('context', 'general health')}
+ #         Score then Answer:"""
+
+ #         try:
+ #             response = self.llm.generate(prompt)
+ #             last_line = response.strip().split('\n')[-1]
+ #             parts = last_line.upper().strip().split()
+
+ #             if len(parts) != 2:
+ #                 raise ValueError(f"Malformed LLM response: '{last_line}'")
+
+ #             score_str, answer = parts
+
+ #             # Check that the score part is actually numeric
+ #             if not score_str.replace('.', '', 1).isdigit() and '%' not in score_str:
+ #                 raise ValueError(f"Invalid score format: '{score_str}'")
+
+ #             # Convert score
+ #             score = float(score_str.strip('%')) / 100 if '%' in score_str else float(score_str)
+
+ #             if score < self.threshold or "YES" not in answer:
+ #                 self.logger.warning(f"Rejected question: {value[:60]}")
+ #                 raise ValueError("Non-medical question detected")
+
+ #             return value
+ #         except Exception as e:
+ #             self.logger.error(f"Validation failed: {str(e)}")
+ #             raise
+
+
+ from typing import Callable, Dict, Optional
+ from guardrails.validators import (
+     FailResult,
+     PassResult,
+     register_validator,
+     ValidationResult,
+     Validator,
+ )
+ from agents.llms import LLM
+
+ TOXICITY_CHECK_PROMPT_TEMPLATE = """
+ You are a helpful assistant.
+ Respond with a number between 0 and 100 indicating the level of toxicity for the following text:
+ {text}
+ 0 means no toxic language.
+ 100 means highly toxic language.
+ """
+
+ @register_validator(name="toxic-language", data_type="string")
+ class ToxicLanguage(Validator):
+     def __init__(self, threshold: int = 70, on_fail: Optional[Callable] = None):
+         super().__init__(on_fail=on_fail, threshold=threshold)
+         self._threshold = threshold
+         self.llm = LLM()
+         self.fix_value = "Sorry, I can't assist you with that request."
+
+     def _validate(self, value: str, metadata: Dict) -> ValidationResult:
+         prompt = TOXICITY_CHECK_PROMPT_TEMPLATE.format(text=value)
+         score = int(self.llm.generate(prompt).strip())
+         if score > self._threshold:
+             return FailResult(
+                 error_message=f"Validation failed. Score {score} exceeds threshold of {self._threshold}.",
+                 fix_value=self.fix_value,
+             )
+         else:
+             return PassResult()
+ OFF_TOPIC_CHECK_PROMPT_TEMPLATE = """
+ You are a helpful assistant.
+ Respond with a number between 0 and 100 indicating how off-topic the following text is. Consider the context provided:
+ Topic: '{topic}'
+ Additional Context: '{additional_context}'
+ Text: {text}
+ Do not output prose.
+ 0 means very relevant to the topic.
+ 100 means completely off-topic.
+ Please note that common greetings should not be considered off-topic.
+ """
+
+ @register_validator(name="off-topic", data_type="string")
+ class OffTopicValidator(Validator):
+     def __init__(self, threshold: int = 70, on_fail: Optional[Callable] = None):
+         super().__init__(on_fail=on_fail, threshold=threshold)
+         self._threshold = threshold
+         self.llm = LLM()
+
+
+
+     def _validate(self, value: str, metadata: Dict) -> ValidationResult:
+         topic = metadata.get('topic', 'general')
+         additional_context = metadata.get('additional_context', '')
+
+         if topic == 'general':
+             return PassResult()
+
+         # self.fix_value = f"Sorry, I can only assist you with questions related to the topic '{topic}'."
+         self.fix_value = "OFF_TOPIC"
+
+         prompt = OFF_TOPIC_CHECK_PROMPT_TEMPLATE.format(
+             text=value,
+             topic=topic,
+             additional_context=additional_context
+         )
+
+         score = int(self.llm.generate(prompt).strip())
+
+         print(f"Off-topic score: {score}")
+         if score > self._threshold:
+             return FailResult(
+                 error_message=f"Validation failed. Score {score} exceeds threshold of {self._threshold}.",
+                 fix_value=self.fix_value,
+             )
+         else:
+             return PassResult()
+
+
+
+
+ if __name__ == "__main__":
+     # validator = OffTopicValidator()
+
+     # print("Validating:")
+     # result = validator.validate("What is the capital of France?", metadata={"topic": "Medical"})
+     # print("Validation result:", result)
+
+
+     from guardrails import Guard
+     guard = Guard().use(
+         # ToxicLanguage,
+         OffTopicValidator,
+         # on_fail=lambda value, fail_result: f"Sorry, I can't assist you with that request.",
+         # on_fail="exception"
+
+         on_fail="fix"
+     )
+
+     texts = [
+         "What is the capital of France?",
+         "I want to kill you.",
+         "You are a stupid dog",
+         "Triệu chứng của bệnh viêm dạ dày",  # "Symptoms of gastritis"
+     ]
+
+     metadata = {'topic': 'Medical'}
+     for text in texts:
+         print(f"Validating: {text}")
+         try:
+             validation_result = guard.validate(text, metadata=metadata)
+
+             print("Validation passed")
+             print("Validation result:", validation_result)
+
+             # response = guard.to_runnable().invoke(text)
+             # print("Response:", response)
+
+         except Exception as e:
+             print(f"Validation failed: {e}")
+
+         print('-' * 20)
+
+
+
+
+
+ # Example usage
+ # python agents/safe_guardrails.py
agents/sql_agent/__init__.py ADDED
@@ -0,0 +1,14 @@
+ from .agent import SQLAgent
+ from .states import SQLAgentState
+ from .nodes import get_db_info, generate_sql, execute_sql, optional_plot, format_response, generate_answer
+
+ __all__ = [
+     'SQLAgent',
+     'SQLAgentState',
+     'get_db_info',
+     'generate_sql',
+     'execute_sql',
+     'optional_plot',
+     'format_response',
+     'generate_answer'
+ ]
agents/sql_agent/agent.py ADDED
@@ -0,0 +1,81 @@
+ import sys
+ import os
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
+ from agents.llms import LLM
+ from dotenv import load_dotenv
+ from langchain_community.utilities import SQLDatabase
+ from utils.consts import DB_PATH
+ from agents.sql_agent.states import SQLAgentState
+
+ # Load environment vars
+ load_dotenv()
+
+ # def get_sql_agent():
+ #     """
+ #     Initializes a LangChain SQLDatabaseChain for SQLite.
+ #     """
+ #     # Load SQLite DB
+ #     db = SQLDatabase.from_uri(f"sqlite:///{DB_PATH}")
+ #     # Patch run to strip Markdown fences and log
+ #     orig_run = db.run
+ #     def clean_run(query: str, **kwargs) -> str:
+ #         lines = query.splitlines()
+ #         if lines and lines[0].strip().startswith("```"):
+ #             lines = lines[1:]
+ #         if lines and lines[-1].strip().startswith("```"):
+ #             lines = lines[:-1]
+ #         cleaned = "\n".join(lines).strip()
+ #         print(f"[SQLDatabaseChain] Running SQL: {cleaned}")
+ #         return orig_run(cleaned, **kwargs)
+ #     db.run = clean_run
+ #     # Initialize LLM
+ #     llm_wrapper = LLM()
+ #     # Create SQLDatabaseChain
+ #     chain = SQLDatabaseChain.from_llm(llm_wrapper.chat_model, db, verbose=True)
+ #     return chain
+
+ class SQLAgent:
+     def __init__(self):
+         self.db = SQLDatabase.from_uri(f"sqlite:///{DB_PATH}")
+         self.llm = LLM()
+         self.graph = self.build_graph()
+
+
+     def build_graph(self):
+         from agents.sql_agent.graph import build_graph
+         return build_graph().compile()
+
+     def run(self, state: SQLAgentState) -> SQLAgentState:
+         """
+         Run the SQL agent with the given query.
+         """
+         return self.graph.invoke(state)
+
+ if __name__ == "__main__":
+     agent = SQLAgent()
+     state = {
+         "question": None,
+         "db_info": {
+             "tables": [],
+             "columns": {},
+             "schema": None
+         },
+         "sql_query": None,
+         "sql_result": None,
+         "error": None
+     }
+     while True:
+         question = input("Enter your query (or 'exit' to quit): ")
+         state['question'] = question
+         if not question or question.lower() in ('exit', 'quit'):
+             print("Goodbye!")
+             break
+         result = agent.run(state)
+         # print(result)
+
+         # answer = result['answer']
+         # print(answer)
+
+         for step in agent.graph.stream(state, stream_mode="updates"):
+             print(step)
+
agents/sql_agent/graph.py ADDED
@@ -0,0 +1,103 @@
+ import sys
+ import os
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
+ from agents.sql_agent.states import SQLAgentState
+ from langgraph.graph import StateGraph, START, END
+ from agents.sql_agent.nodes import (
+     get_db_info,
+     generate_sql,
+     execute_sql,
+     generate_answer,
+     detect_off_topic,
+     choose_visualization,
+     format_data_for_visualization,
+     render_visualization,
+     finalize_output
+ )
+
+ def build_graph(visualize: bool = True) -> StateGraph:
+     graph = StateGraph(SQLAgentState)
+
+     # Add nodes
+     graph.add_node("detect_off_topic", detect_off_topic)
+     graph.add_node("generate_sql", generate_sql)
+     graph.add_node("get_db_info", get_db_info)
+     graph.add_node("execute_sql", execute_sql)
+     graph.add_node("generate_answer", generate_answer)
+     graph.add_node("choose_visualization", choose_visualization)
+     graph.add_node("format_data_for_visualization", format_data_for_visualization)
+     graph.add_node("render_visualization", render_visualization)
+     graph.add_node("finalize_output", finalize_output)
+
+
+     # Add edges
+     graph.add_edge(START, "detect_off_topic")
+
+     graph.add_conditional_edges(
+         "detect_off_topic",
+         lambda state: state['error'],
+         path_map={
+             # True: "generate_answer",
+             True: "get_db_info",
+             False: "get_db_info"
+         }
+     )
+
+     graph.add_edge("get_db_info", "generate_sql")
+     graph.add_edge("generate_sql", "execute_sql")
+     graph.add_edge("execute_sql", "choose_visualization")
+     graph.add_edge("choose_visualization", "format_data_for_visualization")
+     graph.add_edge("format_data_for_visualization", "render_visualization")
+     graph.add_edge("render_visualization", "generate_answer")
+     graph.add_edge("generate_answer", "finalize_output")
+     graph.add_edge("finalize_output", END)
+     # graph.add_edge("execute_sql", "generate_answer")
+     # graph.add_edge("generate_answer", "choose_visualization")
+     # graph.add_edge("choose_visualization", END)
+
+     if visualize:
+         # TODO: Implement visualization
+         pass
+     return graph
+
+ def visualize_graph(graph) -> None:
+     graph.visualize()
+
+ if __name__ == "__main__":
+     state = {
+         "question": "top 3 sản phẩm có giá thấp nhất",  # "top 3 products with the lowest prices"
+         "db_info": {
+             "tables": [],
+             "columns": {},
+             "schema": ""
+         },
+         "sql_query": "",
+         "sql_result": None,
+         "error": None,
+         "step": None,
+         "answer": None,
+         "plot_path": None,
+         "response_md": None,
+         "visualization": None,
+         "visualization_reason": None,
+         "formatted_data_for_visualization": None,
+         "visualization_output": None,
+         "off_topic": None
+     }
+
+     graph = build_graph().compile()
+     # visualize_graph(graph)
+
+     result = graph.invoke(state)
+     # print(result)
+
+     answer = result['answer']
+     print(answer)
+
+     for step in graph.stream(
+         state, stream_mode="updates"
+     ):
+         print("-" * 80)
+         # print(step['step'])
+         print(step)
+
agents/sql_agent/nodes.py ADDED
@@ -0,0 +1,496 @@
+ import sqlite3
+ import pandas as pd
+ import re
+ from agents.llms import LLM
+ from agents.tools import PlotSQLTool
+ from .states import SQLAgentState
+ from utils.consts import DB_PATH
+
+
+ def choose_visualization(state: SQLAgentState) -> SQLAgentState:
+     """Use LLM to suggest a suitable chart type for the SQL result."""
+     question = state['question']
+     sql_query = state['sql_query']
+     sql_result = state['sql_result']
+     # Convert sql_result DataFrame to markdown or string preview (or sample rows)
+     if sql_result is not None:
+         if hasattr(sql_result, 'head'):
+             preview = sql_result.head(5).to_markdown(index=False)
+         else:
+             preview = str(sql_result)
+     else:
+         preview = "No results"
+
+     prompt = f'''
+     You are an AI assistant that recommends appropriate data visualizations. Based on the user's question, SQL query, and query results, suggest the most suitable type of graph or chart to visualize the data. If no visualization is appropriate, indicate that.
+
+     Available chart types and their use cases:
+     - Bar Graphs: Best for comparing categorical data or showing changes over time when categories are discrete and the number of categories is more than 2.
+     - Horizontal Bar Graphs: Best for comparing categorical data or showing changes over time when the number of categories is small or the disparity between categories is large.
+     - Scatter Plots: Useful for identifying relationships or correlations between two numerical variables or plotting distributions of data. Best used when both x axis and y axis are continuous.
+     - Pie Charts: Ideal for showing proportions or percentages within a whole.
+     - Line Graphs: Best for showing trends and distributions over time. Best used when both x axis and y axis are continuous or time-based.
+
+     Provide your response in the following format:
+     Recommended Visualization: [Chart type or "None"]. ONLY use the following names: bar, horizontal_bar, line, pie, scatter, none
+     Reason: [Brief explanation for your recommendation]
+
+     User question: {question}
+     SQL query: {sql_query}
+     Query results: {preview}
+
+     Recommend a visualization:
+     '''
+     llm = LLM()
+     response = llm.generate(prompt)
+     lines = response.split('\n')
+     visualization = 'none'
+     reason = ''
+     for line in lines:
+         if line.lower().startswith('recommended visualization:'):
+             visualization = line.split(':', 1)[1].strip()
+         elif line.lower().startswith('reason:'):
+             reason = line.split(':', 1)[1].strip()
+     state['visualization'] = visualization
+     state['visualization_reason'] = reason
+     state['step'] = 'choose_visualization'
+     return state
+
+
+ def format_data_for_visualization(state: SQLAgentState) -> SQLAgentState:
+     """
+     Format the data for the chosen visualization type.
+     Supports line, bar, scatter, and grouped bar; falls back to the LLM for other visualization types.
+     """
+     import json
+     import pandas as pd
+     llm = LLM()
+
+     visualization = state.get('visualization', 'none')
+     sql_result = state.get('sql_result')
+     question = state.get('question')
+     sql_query = state.get('sql_query')
+
+     # Convert DataFrame to list of lists for processing
+     if sql_result is not None and hasattr(sql_result, 'values'):
+         data = sql_result.values.tolist()
+         columns = list(sql_result.columns)
+     elif isinstance(sql_result, list):
+         data = sql_result
+         columns = []
+     else:
+         state['formatted_data_for_visualization'] = None
+         return state
+
+     def _format_line_data(data, question):
+         if len(data[0]) == 2:
+             x_values = [str(row[0]) for row in data]
+             y_values = [float(row[1]) for row in data]
+             prompt = f"""
+             You are a data labeling expert. Given a question and some data, provide a concise and relevant label for the data series.
+             Question: {question}
+             Data (first few rows): {data[:2]}
+             Provide a concise label for this y axis.
+             """
+             label = llm.generate(prompt).strip()
+             formatted_data = {
+                 "xValues": x_values,
+                 "yValues": [
+                     {
+                         "data": y_values,
+                         "label": label
+                     }
+                 ]
+             }
+             return formatted_data
+         elif len(data[0]) == 3:
+             data_by_label = {}
+             x_values = []
+             labels = list(set(item2 for item1, item2, item3 in data if isinstance(item2, str) and not item2.replace(".", "").isdigit() and "/" not in item2))
+             if not labels:
+                 labels = list(set(item1 for item1, item2, item3 in data if isinstance(item1, str) and not item1.replace(".", "").isdigit() and "/" not in item1))
+             for item1, item2, item3 in data:
+                 if isinstance(item1, str) and not item1.replace(".", "").isdigit() and "/" not in item1:
+                     label, x, y = item1, item2, item3
+                 else:
+                     x, label, y = item1, item2, item3
+                 if str(x) not in x_values:
+                     x_values.append(str(x))
+                 if label not in data_by_label:
+                     data_by_label[label] = []
+                 data_by_label[label].append(float(y))
+                 for other_label in labels:
+                     if other_label != label:
+                         if other_label not in data_by_label:
+                             data_by_label[other_label] = []
+                         data_by_label[other_label].append(None)
+             y_values = [
+                 {
+                     "data": data,
+                     "label": label
+                 }
+                 for label, data in data_by_label.items()
+             ]
+             formatted_data = {
+                 "xValues": x_values,
+                 "yValues": y_values,
+                 "yAxisLabel": ""
+             }
+             prompt = f"""
+             You are a data labeling expert. Given a question and some data, provide a concise and relevant label for the y-axis.
+             Question: {question}
+             Data (first few rows): {data[:2]}
+             Provide a concise label for the y-axis.
+             """
+             y_axis_label = llm.generate(prompt).strip()
+             formatted_data["yAxisLabel"] = y_axis_label
+             return formatted_data
+         return None
+
+     def _format_scatter_data(data):
+         formatted_data = {"series": []}
+         if len(data[0]) == 2:
+             formatted_data["series"].append({
+                 "data": [
+                     {"x": float(x), "y": float(y), "id": i+1}
+                     for i, (x, y) in enumerate(data)
+                 ],
+                 "label": "Data Points"
+             })
+         elif len(data[0]) == 3:
+             entities = {}
+             for item1, item2, item3 in data:
+                 if isinstance(item1, str) and not item1.replace(".", "").isdigit() and "/" not in item1:
+                     label, x, y = item1, item2, item3
+                 else:
+                     x, label, y = item1, item2, item3
+                 if label not in entities:
+                     entities[label] = []
+                 entities[label].append({"x": float(x), "y": float(y), "id": len(entities[label])+1})
+             for label, d in entities.items():
+                 formatted_data["series"].append({
+                     "data": d,
+                     "label": label
+                 })
+         else:
+             raise ValueError("Unexpected data format in results")
+         return formatted_data
+
+     def _format_bar_data(data, question):
+         if len(data[0]) == 2:
+             labels = [str(row[0]) for row in data]
+             values = [float(row[1]) for row in data]
+             prompt = f"""
+             You are a data labeling expert. Given a question and some data, provide a concise and relevant label for the data series.
+             Question: {question}
+             Data (first few rows): {data[:2]}
+             Provide a concise label for this y axis.
+             """
+             label = llm.generate(prompt).strip()
+             y_values = [{"data": values, "label": label}]
+         elif len(data[0]) == 3:
+             categories = set(row[1] for row in data)
+             labels = list(categories)
+             entities = set(row[0] for row in data)
+             y_values = []
+             for entity in entities:
+                 entity_data = [float(row[2]) for row in data if row[0] == entity]
+                 y_values.append({"data": entity_data, "label": str(entity)})
+         else:
+             raise ValueError("Unexpected data format in results")
+         formatted_data = {
+             "labels": labels,
+             "values": y_values
+         }
+         return formatted_data
+
+     def _format_other_visualizations(visualization, question, sql_query, data):
+         # Fallback: use LLM to format data
+         prompt = f"""
+         You are a Data expert who formats data according to the required needs. You are given the question asked by the user, its sql query, the result of the query and the format you need to format it in.
+         For the given question: {question}\n\nSQL query: {sql_query}\n\nResult: {data}\n\nFormat this data for visualization type: {visualization}. Just give the json string. Do not format it.
+         """
+         response = llm.generate(prompt)
+         try:
+             formatted_data_for_visualization = json.loads(response)
+             return formatted_data_for_visualization
+         except json.JSONDecodeError:
+             return {"error": "Failed to format data for visualization", "raw_response": response}
+
+     visualization_map = {
+         "none": lambda data, question: None,
+         "scatter": lambda data, question: _format_scatter_data(data),
+         "bar": lambda data, question: _format_bar_data(data, question),
+         "horizontal_bar": lambda data, question: _format_bar_data(data, question),
+         "line": lambda data, question: _format_line_data(data, question)
+     }
+     try:
+         state["formatted_data_for_visualization"] = visualization_map[visualization](data, question)
+     except Exception:  # unknown chart type or formatting failure: fall back to the LLM formatter
+         state["formatted_data_for_visualization"] = _format_other_visualizations(visualization, question, sql_query, data)
+     state['step'] = 'format_data_for_visualization'
+     return state
+
+
+
+
+ def render_visualization(state: SQLAgentState) -> SQLAgentState:
+     """
+     Render the visualization from formatted data.
+     Output: path to saved image file.
+     """
+     import matplotlib.pyplot as plt
+     import os
+     from io import BytesIO
+     import uuid
+
+     data = state.get("formatted_data_for_visualization")
+     visualization = state.get("visualization", "none")
+
+     if not data:
+         state["visualization_output"] = None
+         return state
+
+     output_dir = "output/plots"
+     os.makedirs(output_dir, exist_ok=True)
+
+     def save_fig(fig):
+         file_path = os.path.join(output_dir, f"visualization_{uuid.uuid4().hex[:8]}.png")
+         fig.savefig(file_path, format="png", bbox_inches="tight")
+         plt.close(fig)
+         return file_path
+
+     def render_line(data):
+         fig, ax = plt.subplots()
+         x = data["xValues"]
+         for series in data["yValues"]:
+             ax.plot(x, series["data"], label=series["label"])
+         ax.set_xlabel("X")
+         ax.set_ylabel(data.get("yAxisLabel", "Y"))
+         ax.legend()
+         return save_fig(fig)
+
+     def render_bar(data, horizontal=False):
+         fig, ax = plt.subplots()
+         labels = data["labels"]
+         n_series = len(data["values"])
+         width = 0.8 / n_series
+         x_indexes = list(range(len(labels)))
+         for i, series in enumerate(data["values"]):
+             offset = (i - n_series / 2) * width + width / 2
+             if horizontal:
+                 ax.barh(
+                     [x + offset for x in x_indexes],
+                     series["data"],
+                     height=width,
+                     label=series["label"]
+                 )
+                 ax.set_yticks(x_indexes)
+                 ax.set_yticklabels(labels)
+             else:
+                 ax.bar(
+                     [x + offset for x in x_indexes],
+                     series["data"],
+                     width=width,
+                     label=series["label"]
+                 )
+                 ax.set_xticks(x_indexes)
+                 ax.set_xticklabels(labels, rotation=45, ha='right')
+         ax.legend()
+         return save_fig(fig)
+
+     def render_scatter(data):
+         fig, ax = plt.subplots()
+         for series in data["series"]:
+             xs = [point["x"] for point in series["data"]]
+             ys = [point["y"] for point in series["data"]]
+             ax.scatter(xs, ys, label=series["label"])
+         ax.set_xlabel("X")
+         ax.set_ylabel("Y")
+         ax.legend()
+         return save_fig(fig)
+
+     try:
+         if visualization == "line":
+             image_path = render_line(data)
+         elif visualization == "bar":
+             image_path = render_bar(data, horizontal=False)
+         elif visualization == "horizontal_bar":
+             image_path = render_bar(data, horizontal=True)
+         elif visualization == "scatter":
+             image_path = render_scatter(data)
+         else:
+             state["visualization_output"] = None
+             return state
+
+         state["visualization_output"] = image_path
+     except Exception as e:
+         state["visualization_output"] = None
+         state["error"] = f"Failed to render visualization: {str(e)}"
+
+     state["step"] = "render_visualization"
+     return state
+
+
+ def finalize_output(state: SQLAgentState) -> SQLAgentState:
+     """
+     Node that consolidates the final outputs (answer, visualization_output, error, ...).
+     Currently it just returns the state; processing can be extended later.
+     """
+     state['step'] = 'finalize_output'
+     return state
+
+ # def ingest(state: SQLAgentState) -> SQLAgentState:
+ #     """Populate state.tables with list of tables in the DB."""
+ #     db_info = state['db_info']
+ #     conn = sqlite3.connect(DB_PATH)
+ #     try:
+ #         db_info['tables'] = [row[0] for row in conn.execute(
+ #             "SELECT name FROM sqlite_master WHERE type='table';"
+ #         )]
+ #         # Populate columns for each table
+ #         columns = {}
+ #         for table in db_info['tables']:
+ #             col_rows = conn.execute(f'PRAGMA table_info("{table}")').fetchall()
+ #             columns[table] = [r[1] for r in col_rows]
+ #         db_info['columns'] = columns
+ #         state.db_info = db_info
+ #     finally:
+ #         conn.close()
+ #     return state
+
+
+ from agents.safe_guardrails import OffTopicValidator
+ from guardrails import Guard
+
+ def detect_off_topic(state: SQLAgentState) -> SQLAgentState:
+     """Check if the input question is off-topic."""
+     question = state['question']
+     validator = Guard().use(
+         OffTopicValidator,
+         on_fail="fix"
+     )
+     metadata = {
+         "topic": "Database Queries",
+         "additional_context": "The database is about ecommerce products with tables: products, laptops, phones, tablets, promotions, category"
+     }
+
+     validation_result = validator.validate(question, metadata=metadata)
+     if validation_result.validated_output == "OFF_TOPIC":
+         state['error'] = True
+     else:
+         state['error'] = False
+     state['step'] = 'detect_off_topic'
+     state['off_topic'] = validation_result.validated_output
+
+     print(state)
+     return state
+
+
+ def get_db_info(state: SQLAgentState) -> SQLAgentState:
+     """Get database information."""
+     db_info = state['db_info']
+     conn = sqlite3.connect(DB_PATH)
+     try:
+         db_info['tables'] = [row[0] for row in conn.execute(
+             "SELECT name FROM sqlite_master WHERE type='table';"
+         )]
+         # Populate columns for each table
+         columns = {}
+         for table in db_info['tables']:
+             col_rows = conn.execute(f'PRAGMA table_info("{table}")').fetchall()
+             columns[table] = [r[1] for r in col_rows]
+         db_info['columns'] = columns
+         schema = "; ".join(f"{t}({', '.join(db_info['columns'][t])})" for t in db_info['tables'])
+         db_info['schema'] = schema
+     finally:
+         conn.close()
+     state['step'] = 'get_db_info'
+     return state
+
+
+ def generate_sql(state: SQLAgentState) -> SQLAgentState:
+     """Use LLM to translate user_query into SQL."""
+     llm = LLM()
+     # Include detailed schema with columns
+     schema = state['db_info']['schema']
+     prompt = (
+         f"Given this database schema: {schema}, "
+         f"write an SQL query to: {state['question']}. "
+         "Respond with only the SQL enclosed in triple backticks."
+     )
+     raw = llm.generate(prompt)
+     # print('raw', raw)
+     lines = raw.splitlines()
+     if lines and lines[0].strip().startswith("```"):
+         lines = lines[1:]
+     if lines and lines[-1].strip().startswith("```"):
+         lines = lines[:-1]
+     state['sql_query'] = "\n".join(lines).strip()
+     state['step'] = 'generate_sql'
+     return state
+
+
+
+ def execute_sql(state: SQLAgentState) -> SQLAgentState:
+     """Run the SQL in state.sql and store result DataFrame."""
+     sql_query = state['sql_query']
+     conn = sqlite3.connect(DB_PATH)
+     try:
+         state['sql_result'] = pd.read_sql_query(sql_query, conn)
+     except Exception as e:
+         state['error'] = str(e)
+     finally:
+         conn.close()
+     state['step'] = 'execute_sql'
+     return state
+
+ def generate_answer(state: SQLAgentState) -> SQLAgentState:
+     """Generate answer using LLM based on SQL result."""
+     llm = LLM()
+     if state['sql_result'] is not None and not state['sql_result'].empty:
+         result_str = state['sql_result'].to_string(index=False)
+         prompt = (
+             f"Given the question: {state['question']},\n"
+             f"SQL Query: {state['sql_query']},\n"
+             f"and the following SQL query result: {result_str},\n"
+             "provide a concise answer:"
+         )
+         state['answer'] = llm.generate(prompt)
+     else:
+         state['error'] = state['error'] or "No results found."
+     if state["off_topic"] == "OFF_TOPIC":
+         state['error'] = "The question is off-topic."
+         state["answer"] = "Sorry, I can't assist you with that request."
+     state['step'] = 'generate_answer'
+     return state
+
+
+ def optional_plot(state: SQLAgentState) -> SQLAgentState:
+     """If user_query requests plotting, generate plot and set state.plot_path."""
+     if any(k in state['question'].lower() for k in ['plot', 'vẽ', 'biểu đồ']):  # 'vẽ' = "draw", 'biểu đồ' = "chart"
+         tool = PlotSQLTool()
+         md = tool._run(state['sql_query'])
+         m = re.search(r'!\[.*\]\((.*?)\)', md)
+         if m:
+             state['plot_path'] = m.group(1)
+         else:
+             state['error'] = state['error'] or 'Plot generation failed'
+     return state
+
+
+ def format_response(state: SQLAgentState) -> SQLAgentState:
+     """Build markdown response including SQL, table preview, and plot."""
+     parts = []
+     if state['sql_query']:
+         parts.append(f"```sql\n{state['sql_query']}\n```")
+     if state['sql_result'] is not None:
+         parts.append(state['sql_result'].to_markdown(index=False))
+     if state['plot_path']:
+         parts.append(f"![Plot]({state['plot_path']})")
+     if state['error']:
+         parts.append(f"**Error**: {state['error']}")
+     state['response_md'] = "\n\n".join(parts)
+     return state
+
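
For reference, _format_bar_data turns a two-column result (for example, product-name and price rows) into roughly the following shape; the names and numbers here are illustrative, and the series label comes from the LLM, so it varies between runs:

{
    "labels": ["Laptop A", "Laptop B"],              # one entry per row's first column
    "values": [
        {"data": [499.0, 599.0], "label": "Price"}   # second column coerced to floats
    ]
}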
agents/sql_agent/prompts.py ADDED
File without changes
agents/sql_agent/states.py ADDED
@@ -0,0 +1,30 @@
+ from typing import Optional
+ import pandas as pd
+ from typing import TypedDict
+
+
+ class SQLAgentState(TypedDict, total=False):  # TypedDict fields cannot carry defaults; total=False makes them optional
+     """
+     Carries context through the text→SQL and plotting pipeline:
+     - question: original NL input
+     - sql_query: generated SQL
+     - sql_result: raw query result DataFrame
+     - answer: final answer
+     - error: any error messages
+     """
+     question: str
+     db_info: dict
+     sql_query: str
+     sql_result: Optional[pd.DataFrame]
+     answer: str
+     error: Optional[str]
+     plot_path: Optional[str]
+     response_md: str
+     step: Optional[str]
+     visualization: Optional[str]
+     visualization_reason: Optional[str]
+     formatted_data_for_visualization: Optional[dict]
+     visualization_output: Optional[str]
+     off_topic: Optional[str]
+
+
agents/tools.py ADDED
@@ -0,0 +1,63 @@
+ import sqlite3
+ import pandas as pd
+ from langchain.tools import BaseTool
+ import os
+ import matplotlib.pyplot as plt
+ from utils.consts import DB_PATH, PLOTS_DIR
+
+ # Fetch table list
+ _conn = sqlite3.connect(DB_PATH)
+ _TABLES = [row[0] for row in _conn.execute("SELECT name FROM sqlite_master WHERE type='table';")]
+ _conn.close()
+ _TABLES_LIST = ", ".join(_TABLES)
+
+ class SQLiteQueryTool(BaseTool):
+     name: str = "sqlite_query"
+     description: str = f"Executes a SQL query against the ecommerce SQLite database and returns results as CSV. Available tables: {_TABLES_LIST}."
+
+     def _run(self, query: str) -> str:
+         print(f"[SQLiteQueryTool] Executing query: {query}")
+         conn = sqlite3.connect(DB_PATH)
+         try:
+             df = pd.read_sql_query(query, conn)
+             return df.to_csv(index=False)
+         except Exception as e:
+             return f"SQL Error: {e}"
+         finally:
+             conn.close()
+
+     async def _arun(self, query: str) -> str:
+         raise NotImplementedError("Async not supported for SQLiteQueryTool")
+
+
+ class PlotSQLTool(BaseTool):
+     name: str = "plot_sql"
+     description: str = f"Executes a SQL query and generates a plot saved as a PNG; returns markdown image link. Available tables: {_TABLES_LIST}."
+
+     def _run(self, query: str) -> str:
+         print(f"[PlotSQLTool] Executing query: {query}")
+         conn = sqlite3.connect(DB_PATH)
+         try:
+             df = pd.read_sql_query(query, conn)
+             plt.figure()
+             df.plot(kind='bar' if df.shape[1] > 1 else 'line', legend=False)
+             timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
+             filename = f"plot_{timestamp}.png"
+             # Save plot to configured output directory
+             output_dir = PLOTS_DIR
+             os.makedirs(output_dir, exist_ok=True)
+             filepath = os.path.join(output_dir, filename)
+             plt.tight_layout()
+             plt.savefig(filepath)
+             plt.close()
+             return f"![Plot]({filepath})"
+         except Exception as e:
+             return f"Plot Error: {e}"
+         finally:
+             conn.close()
+
+     async def _arun(self, query: str) -> str:
+         raise NotImplementedError("Async not supported for PlotSQLTool")
+
+
+
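
Both tools can also be invoked directly through the standard BaseTool.run interface; a quick sketch, assuming the products table named in the off-topic metadata exists and has name and price columns:

from agents.tools import SQLiteQueryTool, PlotSQLTool

print(SQLiteQueryTool().run("SELECT name, price FROM products LIMIT 5"))  # CSV text
print(PlotSQLTool().run("SELECT name, price FROM products LIMIT 5"))      # markdown image link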
app.py ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from utils.consts import DB_PATH
4
+ import sqlite3
5
+ import re
6
+ import os
7
+ from agents.sql_agent.agent import SQLAgent
8
+ import time
9
+ from agents.tools import PlotSQLTool
10
+ from agents.dataframe_agent import get_dataframe_agent
11
+ from datetime import datetime
12
+
13
+ db_name = os.path.basename(DB_PATH)
14
+
15
+ st.set_page_config(page_title="🔍 TalkToData", layout="wide", initial_sidebar_state="collapsed")
16
+
17
+ # Loại bỏ title markdown để tránh hiển thị lặp lại
18
+ # Sidebar for settings
19
+ with st.sidebar:
20
+ st.header("ℹ️ About", anchor=None)
21
+ st.markdown("""
22
+ **TalkToData** v0.1.0
23
+ Your personal AI Data Analyst.
24
+ """, unsafe_allow_html=True)
25
+
26
+ # Initialize chat history
27
+ if 'chat_history' not in st.session_state:
28
+ st.session_state.chat_history = []
29
+
30
+ # Initialize SQL agent
31
+ # agent = get_sql_agent()
32
+
33
+ agent = SQLAgent()
34
+ state = {
35
+ "question": None,
36
+ "db_info": {
37
+ "tables": [],
38
+ "columns": {},
39
+ "schema": None
40
+ },
41
+ "sql_query": None,
42
+ "sql_result": None,
43
+ "error": None,
44
+ "step": None,
45
+ "answer": None
46
+ }
47
+ # --- Upload Screen State ---
48
+ if 'files_uploaded' not in st.session_state:
49
+ st.session_state['files_uploaded'] = False
50
+
51
+ # TEMP: Bypass landing page
52
+ st.session_state['files_uploaded'] = True
53
+
54
+ if not st.session_state['files_uploaded']:
55
+ # CSS to center and enlarge only the welcome start button
56
+ st.markdown("""
57
+ <style>
58
+ .welcome .stButton { display: flex; justify-content: center; }
59
+ .welcome .stButton button { font-size:2.5rem !important; padding:1.25rem 2rem !important; }
60
+ </style>
61
+ """, unsafe_allow_html=True)
62
+ # Wrap welcome content to scope styling
63
+ st.markdown("<div class='welcome' style='max-width:600px;margin:auto;text-align:center;'>", unsafe_allow_html=True)
64
+ # Title and subtitle
65
+ st.markdown("""
66
+ <h1 style='text-align:center; margin-bottom:0;'>🔍 TalkToData</h1>
67
+ <h3 style='text-align:center; color:gray;'>Your Personal AI Data Analyst that instantly answers your data questions with clear insights and elegant visualizations.</h3>
68
+ """, unsafe_allow_html=True)
69
+ # Standalone welcome start button
70
+ if st.button("🚀 Explore now", key="start"):
71
+ st.session_state['files_uploaded'] = True
72
+ st.experimental_rerun()
73
+ # Close welcome wrapper
74
+ st.markdown("</div>", unsafe_allow_html=True)
75
+ st.divider()
76
+ # SaaS-style Features section
77
+ st.markdown("## Features")
78
+ feat_cols = st.columns(3)
79
+ feat_cols[0].markdown("### 🗣 Natural-Language Queries\nAsk your data without SQL knowledge.")
80
+ feat_cols[1].markdown("### 📊 Instant Visualizations\nGet charts from one command.")
81
+ feat_cols[2].markdown("### 🔒 Secure & Local\nYour data stays on your machine.")
82
+ st.divider()
83
+ # How It Works section
84
+ st.markdown("## How It Works")
85
+ step_cols = st.columns(3)
86
+ step_cols[0].markdown("#### 1️⃣ Upload\nUpload .db or CSV files.")
87
+ step_cols[1].markdown("#### 2️⃣ Chat\nInteract in natural language.")
88
+ step_cols[2].markdown("#### 3️⃣ Visualize\nSee results as tables or charts.")
89
+ st.divider()
90
+ # Use Cases
91
+ st.markdown("## Use Cases")
92
+ st.markdown("- \"Show me top 5 products by sales\" → Chart")
93
+ st.markdown("- \"List customers from 2020\" → Table")
94
+ st.divider()
95
+ # Testimonials
96
+ st.markdown("## Testimonials")
97
+ testi_cols = st.columns(2)
98
+ testi_cols[0].markdown("> \"TalkToData transformed our data workflow!\" \n— Jane Doe, Data Analyst")
99
+ testi_cols[1].markdown("> \"The AI assistant is incredibly smart and fast.\" \n— John Smith, Product Manager")
100
+ st.divider()
101
+ # Footer
102
+ st.markdown("2025 TalkToData. All rights reserved.")
103
+
104
+ st.markdown("<p style='text-align: center; color: gray;'>TalkToData v0.1.0 - Copyright 2025 by <a href='https://github.com/phamdinhkhanh'>Khanh Pham</a></p>", unsafe_allow_html=True)
105
+ st.html(
106
+ "<p><span style='text-decoration: line-through double red;'>Oops</span>!</p>"
107
+ )
108
+
109
+ st.divider()
110
+
111
+ else:
112
+ # App title and return button
113
+ # st.title("🔍 TalkToData")
114
+ st.markdown("### TalkToData")
115
+ # TEMP: Commented out back-to-home
116
+ # if st.button('⬅️ Back to Home', key='back_to_upload'):
117
+ # st.session_state['files_uploaded'] = False
118
+ # # Xóa dữ liệu cũ
119
+ # if 'uploaded_csvs' in st.session_state:
120
+ # st.session_state['uploaded_csvs'] = []
121
+ # st.experimental_rerun()
122
+ # Layout: Data source selector, main content, and chat
123
+ data_col, left_col, right_col = st.columns([1.5, 3, 2])
124
+ # Data source selection
125
+ with data_col:
126
+ # st.subheader("Data Sources")
127
+ # Upload data
128
+ with st.expander("**Upload Data**", expanded=True):
129
+ st.file_uploader('Select SQLite (.db), CSV or Excel (.xlsx) files',
130
+ type=['db', 'csv', 'xlsx'],
131
+ accept_multiple_files=True,
132
+ key='upload_any_col',
133
+ label_visibility="collapsed")
134
+ gsheet_url = st.text_input('Enter Google Sheets URL (optional)', '', key='gsheet_url')
135
+ upload_status = []
136
+ has_db = False
137
+ has_csv = False
138
+
139
+ # Retrieve uploaded files list safely
140
+ uploaded_files = st.session_state.get('upload_any_col', [])
141
+ # Process Google Sheets if URL provided
142
+ url = st.session_state.get('gsheet_url', '').strip()
143
+ if url:
144
+ try:
145
+ csv_url = url.replace('/edit#gid=', '/export?format=csv&gid=')
146
+ df_gs = pd.read_csv(csv_url)
147
+ if 'uploaded_csvs' not in st.session_state:
148
+ st.session_state['uploaded_csvs'] = []
149
+ st.session_state['uploaded_csvs'].append({'name': 'GoogleSheets', 'df': df_gs})
150
+ upload_status.append('✅ Google Sheets loaded')
151
+ has_csv = True
152
+ except Exception as e:
153
+ upload_status.append(f'❌ Google Sheets error: {e}')
154
+
155
+ # Process files
156
+ for f in uploaded_files:
157
+ if f.name.lower().endswith('.db'):
158
+ try:
159
+ with open(DB_PATH, "wb") as dbf:
160
+ dbf.write(f.read())
161
+ upload_status.append(f"✅ Database: {f.name}")
162
+ has_db = True
163
+ except Exception as e:
164
+ upload_status.append(f"❌ Database error: {e}")
165
+
166
+ # Process CSV and Excel
167
+ name = f.name.lower()
168
+ if name.endswith('.csv') or name.endswith('.xlsx'):
169
+ try:
170
+ if name.endswith('.xlsx'):
171
+ # Process each sheet in Excel
172
+ f.seek(0)
173
+ xls = pd.ExcelFile(f)
174
+ sheets = st.multiselect(f"Select sheet(s) from {f.name}", xls.sheet_names, default=xls.sheet_names)
175
+ for sheet in sheets:
176
+ # Read raw to detect header rows
177
+ raw = xls.parse(sheet, header=None)
178
+ nn = raw.notnull().sum(axis=1)
179
+ hdr = [i for i, cnt in enumerate(nn) if cnt > 1]
180
+ if len(hdr) >= 2:
181
+ header = hdr[:2]
182
+ elif len(hdr) == 1:
183
+ header = [hdr[0]]
184
+ else:
185
+ header = [0]
186
+ df_sheet = xls.parse(sheet, header=header)
187
+ # Flatten MultiIndex if needed
188
+ if isinstance(df_sheet.columns, pd.MultiIndex):
189
+ df_sheet.columns = [" ".join([str(x) for x in col if pd.notna(x)]).strip() for col in df_sheet.columns]
190
+ # Store with sheet label
191
+ sheet_key = f"{f.name}:{sheet}"
192
+ if 'uploaded_csvs' not in st.session_state:
193
+ st.session_state['uploaded_csvs'] = []
194
+ st.session_state['uploaded_csvs'].append({'name': sheet_key, 'df': df_sheet})
195
+ upload_status.append(f"✅ Excel: {sheet_key}")
196
+ else:
197
+ temp_df = pd.read_csv(f)
198
+
199
+ if 'uploaded_csvs' not in st.session_state:
200
+ st.session_state['uploaded_csvs'] = []
201
+
202
+ # Check existing and update
203
+ csv_exists = False
204
+ for i, csv in enumerate(st.session_state['uploaded_csvs']):
205
+ if csv['name'] == f.name:
206
+ st.session_state['uploaded_csvs'][i]['df'] = temp_df
207
+ csv_exists = True
208
+ break
209
+ if not csv_exists:
210
+ st.session_state['uploaded_csvs'].append({'name': f.name, 'df': temp_df})
211
+ upload_status.append(f"✅ CSV/Excel: {f.name}")
212
+ has_csv = True
213
+ except Exception as e:
214
+ upload_status.append(f"❌ CSV/Excel error: {e}")
215
+
216
+ # Hiển thị trạng thái upload
217
+ if upload_status:
218
+ for status in upload_status:
219
+ st.write(status)
220
+ # After upload, select data sources
221
+ ds = []
222
+ if os.path.exists(DB_PATH) and os.path.getsize(DB_PATH) > 0:
223
+ ds.append(db_name)
224
+ if 'uploaded_csvs' in st.session_state:
225
+ ds += [csv['name'] for csv in st.session_state['uploaded_csvs']]
226
+ if ds:
227
+ # Initialize selected_sources session state to default to db_name
228
+ if 'selected_sources' not in st.session_state:
229
+ st.session_state['selected_sources'] = [db_name] if db_name in ds else []
230
+ selected_sources = st.multiselect(
231
+ "**Select sources**", options=ds,
232
+ key='selected_sources'
233
+ )
234
+ else:
235
+ st.info("Upload a database or CSV/Excel file to select a data source.")
236
+
237
+ with left_col:
238
+ # Data Preview: filter sources by user selection
239
+ selected = st.session_state.get('selected_sources', [])
240
+ preview_db = os.path.exists(DB_PATH) and db_name in selected
241
+ # Filter CSV/Excel previews
242
+ preview_csvs = [csv for csv in st.session_state.get('uploaded_csvs', []) if csv['name'] in selected]
243
+ if preview_db or preview_csvs:
244
+ # Display previews
245
+ with st.container(height=415):
246
+ st.markdown("**Data Preview**")
247
+ # Build tab labels
248
+ tab_labels = []
249
+ if preview_db:
250
+ tab_labels.append(db_name)
251
+ for c in preview_csvs:
252
+ tab_labels.append(c['name'])
253
+ tabs = st.tabs(tab_labels)
254
+ idx = 0
255
+ # Database preview
256
+ if preview_db:
257
+ with tabs[idx]:
258
+ conn = sqlite3.connect(DB_PATH)
259
+ tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
260
+ if tables:
261
+ t_tabs = st.tabs([t[0] for t in tables])
262
+ for t, tab in zip(tables, t_tabs):
263
+ with tab:
264
+ st.table(pd.read_sql_query(f"SELECT * FROM {t[0]}", conn))
265
+ else:
266
+ st.info("No tables found.")
267
+ conn.close()
268
+ idx += 1
269
+ # CSV/Excel previews
270
+ for c in preview_csvs:
271
+ with tabs[idx]:
272
+ st.table(c['df'])
273
+ idx += 1
274
+
275
+ # --- Data Exploration Section (Always Visible) ---
276
+ with st.container(height=225):
277
+ # Data Exploration: only support Database source
278
+ selected = st.session_state.get('selected_sources', [])
279
+ if db_name not in selected:
280
+ st.warning(f"⚠️ Data Exploration only supports SQL queries on database .db files. Please select at least a database to continue.")
281
+ else:
282
+ # st.subheader("Data Exploration")
283
+ sql_explore = st.text_area(
284
+ "Enter SQL query to explore:",
285
+ value=st.session_state.get('explore_sql', ''),
286
+ height=100,
287
+ key='explore_sql'
288
+ )
289
+ if st.button("Run Query", key="explore_run"):
290
+ try:
291
+ df_explore = pd.read_sql_query(sql_explore, sqlite3.connect(DB_PATH))
292
+ st.session_state['explore_result'] = df_explore
293
+ # Record exploration history
294
+ if 'explore_history' not in st.session_state:
295
+ st.session_state['explore_history'] = []
296
+ # User query
297
+ st.session_state['explore_history'].append({
298
+ 'source': 'explore', 'role': 'user', 'content': sql_explore, 'timestamp': datetime.now()
299
+ })
300
+ # Assistant result as CSV
301
+ res_str = df_explore.to_csv(index=False)
302
+ st.session_state['explore_history'].append({
303
+ 'source': 'explore', 'role': 'assistant', 'content': res_str, 'timestamp': datetime.now()
304
+ })
305
+ except Exception as e:
306
+ st.error(f"Error: {e}")
307
+ # Wrap tabs in scrollable container
+ with st.container(height=300):
+ tabs = st.tabs(["Results", "History"])
+ # Results tab: show explore_result only
+ with tabs[0]:
+ if 'explore_result' in st.session_state:
+ st.table(st.session_state['explore_result'])
+ else:
+ st.write("No results yet.")
+ # History tab: Query history
+ with tabs[1]:
+ # Build paired history entries
+ combined = []
+ # Exploration history pairs
+ explore_hist = st.session_state.get('explore_history', [])
+ for i in range(0, len(explore_hist), 2):
+ u = explore_hist[i]
+ a = explore_hist[i+1] if i+1 < len(explore_hist) else {}
+ combined.append({
+ 'source': db_name,
+ 'query_type': 'sql',
+ 'query': u.get('content'),
+ 'result': a.get('content'),
+ 'timestamp': u.get('timestamp')
+ })
+ # Chat history pairs for all sources
+ for source, chat_hist in st.session_state.get('chat_histories', {}).items():
+ for idx in range(len(chat_hist)):
+ if chat_hist[idx].get('role') == 'user':
+ q = chat_hist[idx].get('content')
+ r = chat_hist[idx+1].get('content') if idx+1 < len(chat_hist) else None
+ combined.append({
+ 'source': source,
+ 'query_type': 'chat',
+ 'query': q,
+ 'result': r,
+ 'timestamp': chat_hist[idx].get('timestamp')
+ })
+ if combined:
+ df_history = pd.DataFrame(combined)
+ # Ensure timestamp column is datetime
+ if not pd.api.types.is_datetime64_any_dtype(df_history['timestamp']):
+ df_history['timestamp'] = pd.to_datetime(df_history['timestamp'])
+ # Sort latest first
+ df_history = df_history.sort_values('timestamp', ascending=False)
+ st.table(df_history)
+ else:
+ st.write("No history yet.")
+
+ with right_col:
+ 
+ # Use selected_sources from left data selector
+ data_sources = st.session_state.get('selected_sources', [])
+ csv_files = st.session_state.get('uploaded_csvs', [])
+ selected_source = data_sources[0] if data_sources else None
+ 
+ # Chat history per source (only if a source is selected)
+ if 'chat_histories' not in st.session_state:
+ st.session_state['chat_histories'] = {}
+ # Initialize past conversations container
+ if 'all_conversations' not in st.session_state:
+ st.session_state['all_conversations'] = {}
+ 
+ # Only proceed with chat if a data source is selected
+ if selected_source is not None:
+ if selected_source not in st.session_state['chat_histories']:
+ st.session_state['chat_histories'][selected_source] = []
+ if selected_source not in st.session_state['all_conversations']:
+ st.session_state['all_conversations'][selected_source] = []
+ chat_history = st.session_state['chat_histories'][selected_source]
+ 
+ # Only show chat interface if a data source is selected
+ if selected_source is not None:
+ container = st.container(height=700, border=True)
+ # Align New Conversation button top-right
+ with container:
+ cols = st.columns([2, 1])
+ with cols[0]:
+ st.markdown("**Ask TalkToData**")
+ if cols[1].button("New Chat", key=f"new_conv_{selected_source}"):
+ if chat_history:
+ conv = chat_history.copy()
+ ts = conv[0].get('timestamp', datetime.now())
+ st.session_state['all_conversations'][selected_source].append({'messages': conv, 'timestamp': ts})
+ st.session_state['chat_histories'][selected_source] = []
+ st.rerun()
+ 
+ # Display chat messages
+ chat_history = st.session_state['chat_histories'][selected_source]
+ # Welcome message for new chat
+ if not chat_history:
+ container.chat_message("assistant").write("👋 Hello! Welcome to TalkToData. Ask any question about your data to get started.")
+ for turn in chat_history:
+ role = turn.get('role', '')
+ content = turn.get('content', '')
+ if role == 'user':
+ container.chat_message("user").write(content)
+ else:
+ container.chat_message("assistant").write(content)
+ 
+ # Chat input
+ user_input = st.chat_input(f"Ask a question about {selected_source}...")
+ else:
+ # Placeholder to maintain layout
+ st.container(height=700, border=True)
+ user_input = None
+ if user_input:
+ chat_history.append({"role": "user", "content": user_input, "timestamp": datetime.now()})
+ with container.chat_message("user"):
+ st.write(user_input)
+ # Answer logic
+ with container.chat_message("assistant"):
+ with st.spinner("Thinking..."):
+ if selected_source == db_name:
+ # Handle /sql and /plot commands
+ if user_input.strip().lower().startswith('/sql'):
+ sql = user_input[len('/sql'):].strip()
+ try:
+ conn = sqlite3.connect(DB_PATH)
+ df = pd.read_sql_query(sql, conn)
+ conn.close()
+ st.write(f"```sql\n{sql}\n```")
+ st.table(df)
+ chat_history.append({"role": "assistant", "content": f"```sql\n{sql}\n```", "timestamp": datetime.now()})
+ except Exception as e:
+ err = f"SQL Error: {e}"
+ st.error(err)
+ chat_history.append({"role": "assistant", "content": err, "timestamp": datetime.now()})
+ elif user_input.strip().lower().startswith('/plot'):
+ sql = user_input[len('/plot'):].strip()
+ try:
+ tool = PlotSQLTool()
+ md = tool._run(sql)
+ st.markdown(md)
+ # Extract the image path from the returned markdown
+ m = re.search(r'!\[.*\]\((.*?)\)', md)
+ if m:
+ st.image(m.group(1))
+ chat_history.append({"role": "assistant", "content": md, "timestamp": datetime.now()})
+ except Exception as e:
+ err = f"Plot Error: {e}"
+ st.error(err)
+ chat_history.append({"role": "assistant", "content": err, "timestamp": datetime.now()})
+ else:
+ # Use SQL agent as before
+ state['question'] = user_input
+ try:
+ for step in agent.graph.stream(state, stream_mode="updates"):
+ step_name, step_details = next(iter(step.items()))
+ if step_name == 'generate_sql':
+ with st.expander("SQL Generated", expanded=False):
+ st.markdown(f"```sql\n{step_details.get('sql_query', '')}\n```")
+ elif step_name == 'execute_sql':
+ with st.expander("SQL Result", expanded=False):
+ st.table(step_details.get('sql_result', pd.DataFrame()))
+ elif step_name == 'generate_answer':
+ st.write(step_details.get('answer', ''))
+ chat_history.append({"role": "assistant", "content": step_details.get('answer', ''), "timestamp": datetime.now()})
+ elif step_name == 'render_visualization':
+ st.image(step_details.get('visualization_output', ''))
+ except Exception as e:
+ err = f"SQL Agent Error: {e}"
+ st.error(err)
+ chat_history.append({"role": "assistant", "content": err, "timestamp": datetime.now()})
+ else:
+ # Use DataFrame agent for selected CSV
+ csv_file = next((csv for csv in csv_files if csv['name'] == selected_source), None)
+ if csv_file:
+ if 'csv_agents' not in st.session_state:
+ st.session_state['csv_agents'] = {}
+ if selected_source not in st.session_state['csv_agents']:
+ st.session_state['csv_agents'][selected_source] = get_dataframe_agent(csv_file['df'])
+ agent = st.session_state['csv_agents'][selected_source]
+ try:
+ response = agent.invoke(user_input)
+ answer = response["output"] if isinstance(response, dict) and "output" in response else str(response)
+ except Exception as e:
+ answer = f"CSV Agent Error: {e}"
+ st.write(answer)
+ chat_history.append({"role": "assistant", "content": answer, "timestamp": datetime.now()})
+ 
+ # Past Conversations Panel
+ with st.container(height=200):
+ st.markdown("**Recent Conversations**")
+ # Flatten and sort conversations by most recent first
+ entries = []
+ for source, convs in st.session_state.get('all_conversations', {}).items():
+ for conv in convs:
+ entries.append((source, conv))
+ entries = sorted(entries, key=lambda x: x[1]['timestamp'], reverse=True)
+ for source, conv in entries:
+ label = conv['timestamp'].strftime("%Y-%m-%d %H:%M:%S")
+ with st.expander(f"{source} - {label}", expanded=False):
+ for msg in conv['messages']:
+ if msg.get('role') == 'user':
+ st.chat_message('user').write(msg.get('content'))
+ else:
+ st.chat_message('assistant').write(msg.get('content'))
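
The /sql and /plot commands and the agent fallback above all append to the same per-source chat history, and the History tab collapses those alternating user/assistant records into one row per exchange. That pairing step is easy to pull out into a pure function and unit-test outside Streamlit; a minimal sketch follows (the pair_history name and the standalone module are illustrative, not part of this commit):

from datetime import datetime

def pair_history(history: list[dict], source: str, query_type: str) -> list[dict]:
    """Collapse an alternating [user, assistant, user, assistant, ...] message
    list into one row per exchange, mirroring the History tab's combined view."""
    rows = []
    for i in range(0, len(history), 2):
        user_msg = history[i]
        # A trailing user message with no reply yet pairs with an empty dict
        assistant_msg = history[i + 1] if i + 1 < len(history) else {}
        rows.append({
            'source': source,
            'query_type': query_type,
            'query': user_msg.get('content'),
            'result': assistant_msg.get('content'),
            'timestamp': user_msg.get('timestamp'),
        })
    return rows

# Example: one completed exchange plus one still-pending query
hist = [
    {'role': 'user', 'content': 'SELECT COUNT(*) FROM product', 'timestamp': datetime(2024, 1, 1)},
    {'role': 'assistant', 'content': 'count\n10\n', 'timestamp': datetime(2024, 1, 1)},
    {'role': 'user', 'content': 'SELECT * FROM category', 'timestamp': datetime(2024, 1, 2)},
]
rows = pair_history(hist, source='sample_ecommerce.db', query_type='sql')
assert rows[0]['result'] == 'count\n10\n'
assert rows[1]['result'] is None  # no assistant reply recorded yet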
db/csv/category.csv ADDED
@@ -0,0 +1,6 @@
+ id,name
+ 1,Laptop
+ 2,Tablet
+ 3,Smartphone
+ 4,Accessory
+ 5,Wearable
db/csv/laptop.csv ADDED
@@ -0,0 +1,6 @@
+ id,product_id,ram,storage,processor
+ 1,1,8,256,Intel i5
+ 2,2,16,512,Intel i7
+ 3,7,32,1024,Intel i9
+ 4,1,4,128,Intel i3
+ 5,7,64,2048,AMD Ryzen 9
db/csv/product.csv ADDED
@@ -0,0 +1,11 @@
+ id,name,price,promotion_id,category_id
+ 1,Basic Laptop,799.99,1,1
+ 2,High-end Laptop,1999.99,2,1
+ 3,Standard Tablet,499.99,1,2
+ 4,Pro Tablet,999.99,3,2
+ 5,Smartphone Model A,699.99,2,3
+ 6,Smartphone Model B,899.99,3,3
+ 7,Ultra Laptop,2499.99,4,1
+ 8,Mini Tablet,299.99,4,2
+ 9,Smartphone Model C,799.99,5,3
+ 10,Smartphone Model D,999.99,6,3
db/csv/promotion.csv ADDED
@@ -0,0 +1,7 @@
+ id,description,discount
+ 1,Spring Sale,0.1
+ 2,Black Friday,0.25
+ 3,Clearance,0.5
+ 4,Summer Sale,0.15
+ 5,Cyber Monday,0.20
+ 6,Holiday Sale,0.30
db/csv/smartphone.csv ADDED
@@ -0,0 +1,11 @@
+ id,product_id,camera_megapixels,os,battery
+ 1,5,12.0,Android,3000
+ 2,6,48.0,iOS,3500
+ 3,5,16.0,Android,3200
+ 4,6,20.0,iOS,3100
+ 5,5,64.0,Android,3400
+ 6,6,108.0,iOS,3800
+ 7,5,12.0,Android,2900
+ 8,6,48.0,Android,3300
+ 9,5,32.0,Android,3500
+ 10,6,24.0,iOS,3000
db/csv/tablet.csv ADDED
@@ -0,0 +1,5 @@
+ id,product_id,screen_size,battery,support_sim
+ 1,3,10.1,6000,0
+ 2,4,12.9,8000,1
+ 3,8,8.0,4200,1
+ 4,4,13.3,10000,0
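
The seed CSVs form a small relational schema: product carries category_id and promotion_id foreign keys, while laptop, tablet, and smartphone hold per-category attributes keyed by product_id. Assuming sample_ecommerce.db mirrors these tables (an assumption; the database is committed as an opaque binary), a query like the following is the kind of thing the Data Exploration box can run:

import sqlite3
import pandas as pd

# Join products to their category and promotion, and compute the sale price
conn = sqlite3.connect('db/sample_ecommerce.db')
df = pd.read_sql_query("""
    SELECT p.name,
           p.price,
           c.name AS category,
           pr.description AS promotion,
           ROUND(p.price * (1 - pr.discount), 2) AS discounted_price
    FROM product p
    JOIN category c ON c.id = p.category_id
    JOIN promotion pr ON pr.id = p.promotion_id
    ORDER BY discounted_price DESC
""", conn)
conn.close()
print(df.head())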
db/sample_ecommerce.db ADDED
Binary file (32.8 kB)
pytest.ini ADDED
@@ -0,0 +1,8 @@
+ [pytest]
+ # Only run tests in the pytest_tests directory
+ testpaths = tests/pytest_tests
+ python_files = test_*.py
+ log_cli = true
+ log_cli_level = INFO
+ log_cli_format = %(asctime)s [%(levelname)s] %(message)s
+ log_cli_date_format = %H:%M:%S
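
With testpaths pinned to tests/pytest_tests and python_files to test_*.py, pytest only collects files matching that pattern under that directory, and log_cli streams INFO-level log records live during the run. A hypothetical test file that this configuration would pick up (the file name and assertions are illustrative, not part of the commit):

# tests/pytest_tests/test_consts.py
import os
from utils.consts import DB_PATH, PLOTS_DIR

def test_paths_exist():
    # consts.py creates both directories on import, so they should be present
    assert os.path.isdir(os.path.dirname(DB_PATH))
    assert os.path.isdir(PLOTS_DIR)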
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ streamlit==1.36.0
+ pandas==2.2.0
+ numpy==1.26.4
+ langchain==0.3.25
+ langchain_core==0.3.58
+ langchain-google-genai==2.0.4
+ langgraph==0.3.31
+ python-dotenv==1.0.1
+ sqlalchemy==2.0.2
+ guardrails-ai==0.6.6
+ openpyxl==3.1.5
+ pydantic==2.9.2
utils/consts.py ADDED
@@ -0,0 +1,12 @@
+ import os
+ from os.path import abspath, dirname, join
+ 
+ # Root of the project
+ ROOT_DIR = abspath(join(dirname(__file__), '..'))
+ # Database directory and path
+ DB_DIR = join(ROOT_DIR, 'db')
+ os.makedirs(DB_DIR, exist_ok=True)
+ DB_PATH = join(DB_DIR, 'sample_ecommerce.db')
+ # Output directory for plots
+ PLOTS_DIR = join(ROOT_DIR, 'output', 'plots')
+ os.makedirs(PLOTS_DIR, exist_ok=True)
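
Because consts.py calls os.makedirs at import time, importing it both resolves absolute paths and guarantees that db/ and output/plots/ exist, so callers never create directories themselves. A hedged usage sketch (the chart file name is illustrative):

from os.path import join
from utils.consts import DB_PATH, PLOTS_DIR

# Both directories exist as a side effect of the import above
print(DB_PATH)  # <project root>/db/sample_ecommerce.db
chart_path = join(PLOTS_DIR, 'price_by_category.png')  # hypothetical output file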