Spaces:

cfahlgren1
/

datasets-mcp-server

Running

App Files Files Community

cfahlgren1 HF Staff committed on Sep 11

Commit

b7c3fee

1 Parent(s): 1f9050d

make it adhere to chatgpt spec

Browse files

Files changed (2) hide show

src/index.ts +41 -128
src/utils.ts +97 -0

src/index.ts CHANGED Viewed

@@ -2,71 +2,62 @@
 import { Server } from "@modelcontextprotocol/sdk/server/index.js";
 import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
 import {
   CallToolRequestSchema,
   ListToolsRequestSchema,
   Tool,
 } from "@modelcontextprotocol/sdk/types.js";
-import { z } from "zod";
 import { createServer, IncomingMessage, ServerResponse } from "node:http";
 import { URL } from "node:url";
-const FetchDatasetRowsSchema = z.object({
-  dataset: z.string().describe("The dataset ID (e.g., 'ibm/duorc')"),
-  config: z.string().default("default").describe("The dataset configuration (e.g., 'SelfRC')"),
-  split: z.string().default("train").describe("The dataset split (e.g., 'train', 'validation', 'test')"),
-  offset: z.number().int().min(0).default(0).describe("Starting row offset"),
-  length: z.number().int().min(1).max(100).default(10).describe("Number of rows to fetch (max: 100)"),
 });
-const SearchDatasetsSchema = z.object({
-  query: z.string().optional().describe("Search query for datasets (optional - if omitted, returns popular datasets)"),
-  limit: z.number().int().min(1).max(50).default(20).describe("Maximum number of results to return (max: 50)"),
-});
-const GetDatasetInfoSchema = z.object({
-  dataset: z.string().describe("The dataset ID (e.g., 'ibm/duorc')"),
-  config: z.string().optional().describe("The dataset configuration name (optional)"),
 });
 const TOOL_DEFS: Tool[] = [
   {
-    name: "search_datasets",
     description: "Search for datasets on Hugging Face Hub",
     inputSchema: {
       type: "object",
       properties: {
-        query: { type: "string", description: "Search query for datasets (optional - if omitted, returns popular datasets)" },
-        limit: { type: "number", description: "Maximum number of results to return (max: 50)", default: 20 },
-      },
-      required: [],
-    },
-  },
-  {
-    name: "get_dataset_info",
-    description: "Get detailed information about a specific dataset including splits, configs, and features",
-    inputSchema: {
-      type: "object",
-      properties: {
-        dataset: { type: "string", description: "The dataset ID (e.g., 'ibm/duorc')" },
-        config: { type: "string", description: "The dataset configuration name (optional)" },
       },
-      required: ["dataset"],
     },
   },
   {
-    name: "fetch_dataset_rows",
-    description: "Fetch paginated rows from a Hugging Face dataset",
     inputSchema: {
       type: "object",
       properties: {
-        dataset: { type: "string", description: "The dataset ID (e.g., 'ibm/duorc')" },
-        config: { type: "string", description: "The dataset configuration (e.g., 'SelfRC')", default: "default" },
-        split: { type: "string", description: "The dataset split (e.g., 'train', 'validation', 'test')", default: "train" },
-        offset: { type: "number", description: "Starting row offset", default: 0 },
-        length: { type: "number", description: "Number of rows to fetch (max: 100)", default: 10 },
       },
-      required: ["dataset"],
     },
   },
 ];
@@ -83,98 +74,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
   try {
     switch (name) {
-      case "search_datasets": {
-        const validatedArgs = SearchDatasetsSchema.parse(args);
-        const { query, limit } = validatedArgs;
-        const url = new URL("https://huggingface.co/api/datasets");
-        const params = new URLSearchParams({ limit: String(limit) });
-        if (query) {
-          params.set("search", query);
-        }
-        url.search = params.toString();
-        const response = await fetch(url);
-        if (!response.ok) {
-          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
-        }
-        const datasets = await response.json();
-        const results = datasets.map((dataset: any) => ({
-          id: dataset.id,
-          title: dataset.id,
-          url: `https://huggingface.co/datasets/${dataset.id}`,
-          description: dataset.description || "",
-          author: dataset.author,
-          downloads: dataset.downloads || 0,
-          likes: dataset.likes || 0,
-          tags: dataset.tags || [],
-        }));
-        return {
-          content: [{
-            type: "text",
-            text: JSON.stringify({ results }, null, 2)
-          }]
-        };
-      }
-      case "get_dataset_info": {
-        const validatedArgs = GetDatasetInfoSchema.parse(args);
-        const { dataset, config } = validatedArgs;
-        const url = new URL("https://datasets-server.huggingface.co/info");
-        const params = new URLSearchParams({ dataset });
-        if (config) {
-          params.set("config", config);
-        }
-        url.search = params.toString();
-        const response = await fetch(url);
-        if (!response.ok) {
-          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
-        }
-        const result = await response.json();
-        const info = {
-          id: dataset,
-          title: dataset,
-          text: JSON.stringify(result.dataset_info, null, 2),
-          url: `https://huggingface.co/datasets/${dataset}`,
-          metadata: {
-            source: "huggingface_datasets_server",
-            config: config,
-            partial: result.partial,
-          }
-        };
-        return {
-          content: [{
-            type: "text",
-            text: JSON.stringify(info, null, 2)
-          }]
-        };
       }
-      case "fetch_dataset_rows": {
-        const validatedArgs = FetchDatasetRowsSchema.parse(args);
-        const { dataset, config, split, offset, length } = validatedArgs;
-        const url = new URL("https://datasets-server.huggingface.co/rows");
-        url.search = new URLSearchParams({
-          dataset,
-          config,
-          split,
-          offset: String(offset),
-          length: String(length),
-        }).toString();
-        const response = await fetch(url);
-        if (!response.ok) {
-          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
-        }
-        const result = await response.json();
-        return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
       }
       default:

 import { Server } from "@modelcontextprotocol/sdk/server/index.js";
 import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
+import { z } from "zod";
 import {
   CallToolRequestSchema,
   ListToolsRequestSchema,
   Tool,
 } from "@modelcontextprotocol/sdk/types.js";
 import { createServer, IncomingMessage, ServerResponse } from "node:http";
 import { URL } from "node:url";
+import { searchDatasets, fetchDatasetAggregate, textContent } from "./utils.ts";
/** Arguments accepted by the `search` tool: one free-text `query` string. */
const SearchSchema = z.object({
  query: z.string().describe(
    "Search query for datasets (e.g., 'nlp sentiment analysis', 'computer vision', 'question answering')",
  ),
});
/** Arguments accepted by the `fetch` tool: the `id` of the dataset to retrieve. */
const FetchSchema = z.object({
  id: z.string().describe(
    "Unique dataset identifier (e.g., 'squad', 'imdb', 'cfahlgren1/hub-stats')",
  ),
});
 const TOOL_DEFS: Tool[] = [
   {
+    name: "search",
     description: "Search for datasets on Hugging Face Hub",
     inputSchema: {
       type: "object",
       properties: {
+        query: {
+          type: "string",
+          description:
+            "Search query for datasets (e.g., 'nlp sentiment analysis', 'computer vision', 'question answering')",
+        },
       },
+      required: ["query"],
     },
   },
   {
+    name: "fetch",
+    description:
+      "Retrieve full information and sample data for a specific dataset",
     inputSchema: {
       type: "object",
       properties: {
+        id: {
+          type: "string",
+          description:
+            "Unique dataset identifier (e.g., 'squad', 'imdb', 'cfahlgren1/hub-stats')",
+        },
       },
+      required: ["id"],
     },
   },
 ];
   try {
     switch (name) {
+      case "search": {
+        const validatedArgs = SearchSchema.parse(args);
+        const { query } = validatedArgs;
+        const results = await searchDatasets(query);
+        return textContent({ results });
       }
+      case "fetch": {
+        const validatedArgs = FetchSchema.parse(args);
+        const { id: datasetId } = validatedArgs;
+        const fetchResult = await fetchDatasetAggregate(datasetId);
+        return textContent(fetchResult);
       }
       default:

src/utils.ts ADDED Viewed

	@@ -0,0 +1,97 @@

+import { URL } from "node:url";
/**
 * Build a URL from a base address plus query parameters.
 *
 * Each entry of `params` is applied with `searchParams.set`, so a query string
 * already present on `base` is preserved (a key present in both takes the value
 * from `params`) rather than being discarded. For a base without a query the
 * result is the same as assigning the serialized params to `url.search`.
 *
 * @param base - Absolute URL string; `new URL` throws a `TypeError` if it cannot be parsed.
 * @param params - Query parameters to apply; values are form-urlencoded on serialization.
 * @returns A new `URL` carrying the merged query string.
 */
export function buildUrl(base: string, params: Record<string, string>): URL {
  const url = new URL(base);
  for (const [key, value] of Object.entries(params)) {
    url.searchParams.set(key, value);
  }
  return url;
}
/**
 * Request `url` with `fetch` and parse the response body as JSON.
 *
 * @typeParam T - Shape the caller expects; the payload is NOT validated against it.
 * @param url - Endpoint to request.
 * @param context - Prefix used in the error message when the status is not OK.
 * @returns The parsed JSON body.
 * @throws Error with message `"<context>: HTTP <status>"` when `response.ok` is false.
 */
export async function fetchJson<T = any>(url: URL, context: string): Promise<T> {
  const response = await fetch(url);
  if (!response.ok) {
    // The error body is never read; cancel it so the connection is released
    // promptly instead of lingering until garbage collection. A failure to
    // cancel must not mask the HTTP error reported below.
    await response.body?.cancel().catch(() => undefined);
    throw new Error(`${context}: HTTP ${response.status}`);
  }
  return (await response.json()) as T;
}
/**
 * Choose a config/split pair for a dataset using the datasets-server API.
 *
 * The config is the first entry returned by `/configs`; the split is the first
 * key of `dataset_info.splits` returned by `/info` for that config.
 *
 * @param datasetId - Dataset identifier, sent as the `dataset` query parameter.
 * @returns The selected `config` and `split` names.
 * @throws Error when no config or no split is found; errors from `fetchJson` propagate.
 */
export async function getDefaultConfigAndSplit(datasetId: string): Promise<{ config: string; split: string }> {
  const configListing = await fetchJson<any>(
    buildUrl("https://datasets-server.huggingface.co/configs", { dataset: datasetId }),
    "Failed to get configs",
  );
  const config = configListing.configs?.[0]?.config as string | undefined;
  if (!config) {
    throw new Error("No configs found for dataset");
  }

  const info = await fetchJson<any>(
    buildUrl("https://datasets-server.huggingface.co/info", { dataset: datasetId, config }),
    "Failed to get dataset info",
  );
  // Only the first split name is needed; a missing `splits` map counts as empty.
  const [split] = Object.keys(info.dataset_info?.splits || {});
  if (!split) {
    throw new Error("No splits found for dataset");
  }

  return { config, split };
}
/**
 * Search the Hugging Face Hub dataset listing.
 *
 * Sends `query` as the `search` parameter with `limit` fixed at "20" and
 * reduces each returned entry to an id, a title (same as the id) and its Hub URL.
 *
 * @param query - Free-text search string.
 * @returns One `{ id, title, url }` record per entry in the API response.
 * @throws Error from `fetchJson` when the request does not succeed.
 */
export async function searchDatasets(query: string): Promise<Array<{ id: string; title: string; url: string }>> {
  const endpoint = buildUrl("https://huggingface.co/api/datasets", { search: query, limit: "20" });
  const hits = await fetchJson<any[]>(endpoint, "Dataset search failed");

  const toResult = ({ id }: any) => ({
    id,
    title: id,
    url: `https://huggingface.co/datasets/${id}`,
  });
  return hits.map(toResult);
}
/**
 * Assemble a single text document describing a dataset: its `dataset_info`
 * JSON followed by up to 50 sample rows from the default config/split.
 *
 * The `/info` and `/rows` requests do not depend on each other, so they are
 * issued concurrently. Sample rows are best-effort: any failure there yields
 * the text "Sample data: Not available" instead of an error.
 *
 * @param datasetId - Dataset identifier, sent as the `dataset` query parameter.
 * @returns `{ id, title, text, url }` where `id` and `title` are both `datasetId`.
 * @throws Error when the config/split lookup or the `/info` request fails.
 */
export async function fetchDatasetAggregate(datasetId: string): Promise<{
  id: string;
  title: string;
  text: string;
  url: string;
}> {
  const { config, split } = await getDefaultConfigAndSplit(datasetId);

  // NOTE(review): getDefaultConfigAndSplit has already requested /info for this
  // same dataset/config, so the request below repeats it. Removing the
  // duplicate would require that helper to expose the info payload.
  const infoUrl = buildUrl("https://datasets-server.huggingface.co/info", {
    dataset: datasetId,
    config,
  });
  const rowsUrl = buildUrl("https://datasets-server.huggingface.co/rows", {
    dataset: datasetId,
    config,
    split,
    offset: "0",
    length: "50",
  });

  // Never rejects: every failure path resolves to the "Not available" text,
  // which makes it safe to run alongside the /info request in Promise.all.
  const loadSampleData = async (): Promise<string> => {
    try {
      const rowsResponse = await fetch(rowsUrl);
      if (!rowsResponse.ok) {
        // Release the unread error body so the connection is freed promptly.
        await rowsResponse.body?.cancel();
        return "\n\nSample data: Not available";
      }
      const rowsResult = await rowsResponse.json();
      return `\n\nSample data (${config}/${split}):\n${JSON.stringify(rowsResult.rows || [], null, 2)}`;
    } catch {
      return "\n\nSample data: Not available";
    }
  };

  const [infoResult, sampleData] = await Promise.all([
    fetchJson<any>(infoUrl, "Failed to get dataset info"),
    loadSampleData(),
  ]);

  return {
    id: datasetId,
    title: datasetId,
    text: `Dataset Information:\n${JSON.stringify(infoResult.dataset_info, null, 2)}${sampleData}`,
    url: `https://huggingface.co/datasets/${datasetId}`,
  };
}
/**
 * Wrap a payload as a result object holding one text content item whose
 * `text` is the payload serialized as compact JSON.
 *
 * @param payload - Any JSON-serializable value.
 * @returns `{ content: [{ type: "text", text }] }` with `text` always a string.
 * @throws TypeError from `JSON.stringify` for values it cannot serialize
 *   (circular structures, BigInt).
 */
export function textContent(payload: unknown) {
  // JSON.stringify yields undefined (not a string) for undefined, functions
  // and symbols; fall back to "null" so `text` is always a valid JSON string.
  const text: string = JSON.stringify(payload) ?? "null";
  return { content: [{ type: "text" as const, text }] };
}