# Source: datatrees/tools/paris_trees_info.py
# Author: alihmaou
# Deploy tools/paris_trees_info.py via Meta-MCP
# Commit: 387f73c (verified)
import gradio as gr
import json
from PIL import Image
import numpy as np
# --- User Defined Logic ---
import tempfile
import os
import requests
import duckdb
from typing import Dict, Any
def _format_stat(value, digits: int) -> str:
    """Render one SUMMARIZE cell as a fixed-precision number, or "N/A" for NULL."""
    if value is None:
        return "N/A"
    try:
        # SUMMARIZE may hand numeric statistics back as strings or Decimals,
        # so coerce before applying a float format spec.
        return f"{float(value):.{digits}f}"
    except (TypeError, ValueError):
        # Non-numeric statistic: show it verbatim rather than fail.
        return str(value)


def paris_trees_info() -> str:
    """Download the Paris trees dataset and describe its structure.

    The parquet export is streamed to a temporary file, exposed to an
    in-memory DuckDB connection as the view ``paris_trees``, and inspected
    with ``COUNT(*)``, ``DESCRIBE`` and ``SUMMARIZE``.

    Returns:
        str: A formatted report with the total row count, each column's name
        and data type, and per-column min / max / average / null percentage.

    Raises:
        requests.RequestException: If the download fails, returns an HTTP
            error status, or times out.
        duckdb.Error: If the downloaded file cannot be read or queried.
    """
    url = "https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/les-arbres/exports/parquet?lang=fr&timezone=Europe%2FBerlin"

    tmp_path = None
    conn = None
    try:
        # (connect, read) timeouts so a stalled server cannot hang the tool;
        # the context manager returns the streamed connection to the pool.
        with requests.get(url, stream=True, timeout=(10, 120)) as response:
            response.raise_for_status()
            # delete=False: DuckDB reopens the file by path after this handle
            # is closed; the file is removed in the ``finally`` clause below.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") as tmp_file:
                tmp_path = tmp_file.name
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    tmp_file.write(chunk)

        conn = duckdb.connect()

        # The path is embedded in the statement as a SQL string literal, so
        # any single quote in it has to be doubled.
        sql_path = tmp_path.replace("'", "''")
        conn.execute(
            f"CREATE VIEW paris_trees AS SELECT * FROM read_parquet('{sql_path}')"
        )

        row_count = conn.execute("SELECT COUNT(*) FROM paris_trees").fetchone()[0]
        columns_info = conn.execute("DESCRIBE paris_trees").fetchall()

        # Address SUMMARIZE columns by name: positional indexing previously
        # picked up approx_unique and q25 instead of avg and null_percentage.
        summary_cursor = conn.execute("SUMMARIZE paris_trees")
        summary_fields = [field[0] for field in summary_cursor.description]
        summary = [dict(zip(summary_fields, row)) for row in summary_cursor.fetchall()]

        output = [
            "=== Paris Trees Dataset Information ===\n",
            f"Total Rows: {row_count:,}\n",
            "\n=== Column Information ===",
            f"{'Column Name':<30} {'Data Type':<20}",
            "-" * 50,
        ]
        output.extend(f"{col[0]:<30} {col[1]:<20}" for col in columns_info)

        output.append("\n=== Summary Statistics ===")
        output.append(f"{'Column':<30} {'Min':<15} {'Max':<15} {'Avg':<15} {'Null %':<10}")
        output.append("-" * 85)
        for stats in summary:
            col_name = str(stats.get("column_name"))
            min_val = str(stats["min"]) if stats.get("min") is not None else "N/A"
            max_val = str(stats["max"]) if stats.get("max") is not None else "N/A"
            avg_val = _format_stat(stats.get("avg"), 2)
            null_pct = _format_stat(stats.get("null_percentage"), 1)
            output.append(
                f"{col_name:<30} {min_val:<15} {max_val:<15} {avg_val:<15} {null_pct:<10}"
            )

        return "\n".join(output)
    finally:
        try:
            # Close before unlinking: an open handle blocks deletion on Windows.
            if conn is not None:
                conn.close()
        finally:
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)
# --- Interface Factory ---
def create_interface():
    """Build the Gradio interface that exposes ``paris_trees_info``.

    The wrapped function takes no arguments, so the interface is created with
    an empty input list and a single text box for the generated report.
    """
    # The tool has no parameters, hence no input components.
    input_components = []
    report_box = gr.Textbox(
        label="Comprehensive dataset information including row count, column details, and summary statistics"
    )
    return gr.Interface(
        fn=paris_trees_info,
        inputs=input_components,
        outputs=report_box,
        title="paris_trees_info",
        description="Auto-generated tool: paris_trees_info",
    )