|
|
import gradio as gr |
|
|
import json |
|
|
from PIL import Image |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
import tempfile |
|
|
import os |
|
|
import requests |
|
|
import duckdb |
|
|
from typing import Dict, Any |
|
|
|
|
|
def _format_summary_stat(value, precision):
    """Render one SUMMARIZE cell as a fixed-precision number, or as text if it is not numeric."""
    if value is None:
        return "N/A"
    try:
        # SUMMARIZE may hand numeric aggregates back as strings or Decimals;
        # float() accepts both.
        return f"{float(value):.{precision}f}"
    except (TypeError, ValueError):
        return str(value)


def paris_trees_info() -> str:
    """
    Downloads the Paris trees dataset (parquet format) and provides comprehensive information about the dataset structure including column names, data types, row count, and basic statistics. This helps users understand what data is available about tree protection efforts in Paris.

    The parquet export is streamed to a temporary file, inspected through an
    in-memory DuckDB connection, and the temporary file is removed before
    returning, whether or not an error occurred.

    Returns:
        str: A formatted string containing dataset information including number of rows, column names and types, and basic descriptive statistics.

    Raises:
        requests.exceptions.RequestException: If the download fails, times
            out, or the server answers with an HTTP error status.
    """
    url = "https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/les-arbres/exports/parquet?lang=fr&timezone=Europe%2FBerlin"

    # Create the temp file before the request so the `finally` clause below can
    # always remove it, even when the download fails halfway through.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")
    tmp_path = tmp_file.name
    conn = None

    try:
        # (connect timeout, read timeout) in seconds: without a timeout a
        # stalled server would block this call forever. The context manager
        # releases the streamed connection back to the pool.
        with tmp_file, requests.get(url, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=8192):
                tmp_file.write(chunk)

        conn = duckdb.connect()

        # Double any single quote so the path stays a valid SQL string literal.
        quoted_path = tmp_path.replace("'", "''")
        conn.execute(
            f"CREATE VIEW paris_trees AS SELECT * FROM read_parquet('{quoted_path}')"
        )

        row_count = conn.execute("SELECT COUNT(*) FROM paris_trees").fetchone()[0]
        columns_info = conn.execute("DESCRIBE paris_trees").fetchall()

        # Address SUMMARIZE columns by name rather than by position: avg and
        # null_percentage do not sit at indices 4 and 7 of the result row.
        summary_cursor = conn.execute("SUMMARIZE paris_trees")
        summary_fields = [field[0] for field in summary_cursor.description]
        summary = [dict(zip(summary_fields, row)) for row in summary_cursor.fetchall()]

        output = [
            "=== Paris Trees Dataset Information ===\n",
            f"Total Rows: {row_count:,}\n",
            "\n=== Column Information ===",
            f"{'Column Name':<30} {'Data Type':<20}",
            "-" * 50,
        ]
        output.extend(f"{col[0]:<30} {col[1]:<20}" for col in columns_info)

        output.append("\n=== Summary Statistics ===")
        output.append(f"{'Column':<30} {'Min':<15} {'Max':<15} {'Avg':<15} {'Null %':<10}")
        output.append("-" * 85)
        for stats in summary:
            col_name = str(stats.get("column_name"))
            min_val = str(stats["min"]) if stats.get("min") is not None else "N/A"
            max_val = str(stats["max"]) if stats.get("max") is not None else "N/A"
            avg_val = _format_summary_stat(stats.get("avg"), 2)
            null_pct = _format_summary_stat(stats.get("null_percentage"), 1)
            output.append(
                f"{col_name:<30} {min_val:<15} {max_val:<15} {avg_val:<15} {null_pct:<10}"
            )

        return "\n".join(output)

    finally:
        # Close the connection first: an open handle on the parquet file can
        # prevent its deletion on some platforms.
        if conn is not None:
            conn.close()
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
|
|
|
|
|
|
|
def create_interface():
    """Build the Gradio interface that exposes ``paris_trees_info``.

    The wrapped function takes no arguments, so the interface is created with
    an empty input list and a single text box for the generated report.

    Returns:
        The ``gr.Interface`` instance wrapping ``paris_trees_info``.
    """
    # The tool has no parameters, hence no input components.
    input_components = []
    report_box = gr.Textbox(
        label="Comprehensive dataset information including row count, column details, and summary statistics"
    )
    interface = gr.Interface(
        fn=paris_trees_info,
        inputs=input_components,
        outputs=report_box,
        title="paris_trees_info",
        description="Auto-generated tool: paris_trees_info",
    )
    return interface