cfahlgren1 HF Staff committed on
Commit
b7c3fee
·
1 Parent(s): 1f9050d

make it adhere to chatgpt spec

Browse files
Files changed (2) hide show
  1. src/index.ts +41 -128
  2. src/utils.ts +97 -0
src/index.ts CHANGED
@@ -2,71 +2,62 @@
2
 
3
  import { Server } from "@modelcontextprotocol/sdk/server/index.js";
4
  import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
 
5
  import {
6
  CallToolRequestSchema,
7
  ListToolsRequestSchema,
8
  Tool,
9
  } from "@modelcontextprotocol/sdk/types.js";
10
- import { z } from "zod";
11
  import { createServer, IncomingMessage, ServerResponse } from "node:http";
12
  import { URL } from "node:url";
13
-
14
- const FetchDatasetRowsSchema = z.object({
15
- dataset: z.string().describe("The dataset ID (e.g., 'ibm/duorc')"),
16
- config: z.string().default("default").describe("The dataset configuration (e.g., 'SelfRC')"),
17
- split: z.string().default("train").describe("The dataset split (e.g., 'train', 'validation', 'test')"),
18
- offset: z.number().int().min(0).default(0).describe("Starting row offset"),
19
- length: z.number().int().min(1).max(100).default(10).describe("Number of rows to fetch (max: 100)"),
 
20
  });
21
 
22
- const SearchDatasetsSchema = z.object({
23
- query: z.string().optional().describe("Search query for datasets (optional - if omitted, returns popular datasets)"),
24
- limit: z.number().int().min(1).max(50).default(20).describe("Maximum number of results to return (max: 50)"),
25
- });
26
-
27
- const GetDatasetInfoSchema = z.object({
28
- dataset: z.string().describe("The dataset ID (e.g., 'ibm/duorc')"),
29
- config: z.string().optional().describe("The dataset configuration name (optional)"),
30
  });
31
 
32
  const TOOL_DEFS: Tool[] = [
33
  {
34
- name: "search_datasets",
35
  description: "Search for datasets on Hugging Face Hub",
36
  inputSchema: {
37
  type: "object",
38
  properties: {
39
- query: { type: "string", description: "Search query for datasets (optional - if omitted, returns popular datasets)" },
40
- limit: { type: "number", description: "Maximum number of results to return (max: 50)", default: 20 },
41
- },
42
- required: [],
43
- },
44
- },
45
- {
46
- name: "get_dataset_info",
47
- description: "Get detailed information about a specific dataset including splits, configs, and features",
48
- inputSchema: {
49
- type: "object",
50
- properties: {
51
- dataset: { type: "string", description: "The dataset ID (e.g., 'ibm/duorc')" },
52
- config: { type: "string", description: "The dataset configuration name (optional)" },
53
  },
54
- required: ["dataset"],
55
  },
56
  },
57
  {
58
- name: "fetch_dataset_rows",
59
- description: "Fetch paginated rows from a Hugging Face dataset",
 
60
  inputSchema: {
61
  type: "object",
62
  properties: {
63
- dataset: { type: "string", description: "The dataset ID (e.g., 'ibm/duorc')" },
64
- config: { type: "string", description: "The dataset configuration (e.g., 'SelfRC')", default: "default" },
65
- split: { type: "string", description: "The dataset split (e.g., 'train', 'validation', 'test')", default: "train" },
66
- offset: { type: "number", description: "Starting row offset", default: 0 },
67
- length: { type: "number", description: "Number of rows to fetch (max: 100)", default: 10 },
68
  },
69
- required: ["dataset"],
70
  },
71
  },
72
  ];
@@ -83,98 +74,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
83
 
84
  try {
85
  switch (name) {
86
- case "search_datasets": {
87
- const validatedArgs = SearchDatasetsSchema.parse(args);
88
- const { query, limit } = validatedArgs;
89
-
90
- const url = new URL("https://huggingface.co/api/datasets");
91
- const params = new URLSearchParams({ limit: String(limit) });
92
- if (query) {
93
- params.set("search", query);
94
- }
95
- url.search = params.toString();
96
 
97
- const response = await fetch(url);
98
- if (!response.ok) {
99
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
100
- }
101
- const datasets = await response.json();
102
-
103
- const results = datasets.map((dataset: any) => ({
104
- id: dataset.id,
105
- title: dataset.id,
106
- url: `https://huggingface.co/datasets/${dataset.id}`,
107
- description: dataset.description || "",
108
- author: dataset.author,
109
- downloads: dataset.downloads || 0,
110
- likes: dataset.likes || 0,
111
- tags: dataset.tags || [],
112
- }));
113
-
114
- return {
115
- content: [{
116
- type: "text",
117
- text: JSON.stringify({ results }, null, 2)
118
- }]
119
- };
120
- }
121
-
122
- case "get_dataset_info": {
123
- const validatedArgs = GetDatasetInfoSchema.parse(args);
124
- const { dataset, config } = validatedArgs;
125
-
126
- const url = new URL("https://datasets-server.huggingface.co/info");
127
- const params = new URLSearchParams({ dataset });
128
- if (config) {
129
- params.set("config", config);
130
- }
131
- url.search = params.toString();
132
-
133
- const response = await fetch(url);
134
- if (!response.ok) {
135
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
136
- }
137
- const result = await response.json();
138
-
139
- const info = {
140
- id: dataset,
141
- title: dataset,
142
- text: JSON.stringify(result.dataset_info, null, 2),
143
- url: `https://huggingface.co/datasets/${dataset}`,
144
- metadata: {
145
- source: "huggingface_datasets_server",
146
- config: config,
147
- partial: result.partial,
148
- }
149
- };
150
-
151
- return {
152
- content: [{
153
- type: "text",
154
- text: JSON.stringify(info, null, 2)
155
- }]
156
- };
157
  }
158
 
159
- case "fetch_dataset_rows": {
160
- const validatedArgs = FetchDatasetRowsSchema.parse(args);
161
- const { dataset, config, split, offset, length } = validatedArgs;
162
-
163
- const url = new URL("https://datasets-server.huggingface.co/rows");
164
- url.search = new URLSearchParams({
165
- dataset,
166
- config,
167
- split,
168
- offset: String(offset),
169
- length: String(length),
170
- }).toString();
171
 
172
- const response = await fetch(url);
173
- if (!response.ok) {
174
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
175
- }
176
- const result = await response.json();
177
- return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
178
  }
179
 
180
  default:
 
2
 
3
  import { Server } from "@modelcontextprotocol/sdk/server/index.js";
4
  import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
5
+ import { z } from "zod";
6
  import {
7
  CallToolRequestSchema,
8
  ListToolsRequestSchema,
9
  Tool,
10
  } from "@modelcontextprotocol/sdk/types.js";
 
11
  import { createServer, IncomingMessage, ServerResponse } from "node:http";
12
  import { URL } from "node:url";
13
+ import { searchDatasets, fetchDatasetAggregate, textContent } from "./utils.ts";
14
+
15
/**
 * Zod schema for the arguments of the `search` tool: a single required
 * `query` string. The `search` branch of the tool-call handler parses the
 * incoming arguments with it.
 */
const SearchSchema = z.object({
  query: z.string().describe(
    "Search query for datasets (e.g., 'nlp sentiment analysis', 'computer vision', 'question answering')",
  ),
});
22
 
23
/**
 * Zod schema for the arguments of the `fetch` tool: a single required `id`
 * string naming the dataset. The `fetch` branch of the tool-call handler
 * parses the incoming arguments with it.
 */
const FetchSchema = z.object({
  id: z.string().describe(
    "Unique dataset identifier (e.g., 'squad', 'imdb', 'cfahlgren1/hub-stats')",
  ),
});
30
 
31
  const TOOL_DEFS: Tool[] = [
32
  {
33
+ name: "search",
34
  description: "Search for datasets on Hugging Face Hub",
35
  inputSchema: {
36
  type: "object",
37
  properties: {
38
+ query: {
39
+ type: "string",
40
+ description:
41
+ "Search query for datasets (e.g., 'nlp sentiment analysis', 'computer vision', 'question answering')",
42
+ },
 
 
 
 
 
 
 
 
 
43
  },
44
+ required: ["query"],
45
  },
46
  },
47
  {
48
+ name: "fetch",
49
+ description:
50
+ "Retrieve full information and sample data for a specific dataset",
51
  inputSchema: {
52
  type: "object",
53
  properties: {
54
+ id: {
55
+ type: "string",
56
+ description:
57
+ "Unique dataset identifier (e.g., 'squad', 'imdb', 'cfahlgren1/hub-stats')",
58
+ },
59
  },
60
+ required: ["id"],
61
  },
62
  },
63
  ];
 
74
 
75
  try {
76
  switch (name) {
77
+ case "search": {
78
+ const validatedArgs = SearchSchema.parse(args);
79
+ const { query } = validatedArgs;
 
 
 
 
 
 
 
80
 
81
+ const results = await searchDatasets(query);
82
+ return textContent({ results });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  }
84
 
85
+ case "fetch": {
86
+ const validatedArgs = FetchSchema.parse(args);
87
+ const { id: datasetId } = validatedArgs;
 
 
 
 
 
 
 
 
 
88
 
89
+ const fetchResult = await fetchDatasetAggregate(datasetId);
90
+ return textContent(fetchResult);
 
 
 
 
91
  }
92
 
93
  default:
src/utils.ts ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { URL } from "node:url";
2
+
3
/**
 * Builds a URL from a base address plus query parameters.
 *
 * Every entry of `params` is applied with `searchParams.set`, so values are
 * encoded by the URL implementation. Query parameters already present in
 * `base` are preserved; an entry in `params` with the same key replaces the
 * existing value.
 *
 * @param base - Absolute URL string; `new URL` throws a `TypeError` when it cannot be parsed.
 * @param params - Query parameters to apply.
 * @returns A new `URL` carrying the merged query string.
 */
export function buildUrl(base: string, params: Record<string, string>): URL {
  const url = new URL(base);
  // Merge instead of assigning `url.search`, which would discard any query
  // string that `base` already carries.
  for (const [key, value] of Object.entries(params)) {
    url.searchParams.set(key, value);
  }
  return url;
}
8
+
9
/**
 * Issues a request to `url` with the global `fetch` and returns the response
 * body parsed as JSON.
 *
 * @typeParam T - Shape the caller expects; the payload is NOT validated against it.
 * @param url - Address to request.
 * @param context - Prefix for the error message when the response status is not OK.
 * @returns The parsed JSON payload, typed as `T` on trust.
 * @throws Error `"<context>: HTTP <status>"` when `response.ok` is false.
 *   Rejections from `fetch` itself or from JSON parsing propagate unchanged.
 */
export async function fetchJson<T = any>(url: URL, context: string): Promise<T> {
  const response = await fetch(url);
  if (!response.ok) {
    // The error body is never read; cancel it so the underlying connection is
    // released promptly instead of waiting for garbage collection.
    await response.body?.cancel().catch(() => undefined);
    throw new Error(`${context}: HTTP ${response.status}`);
  }
  return (await response.json()) as T;
}
16
+
17
/**
 * Chooses the config and split used when sampling a dataset.
 *
 * Requests the dataset's configs and takes the first one, then requests that
 * config's info and takes the first key of `dataset_info.splits`.
 *
 * @param datasetId - Dataset identifier sent as the `dataset` query parameter.
 * @returns The first config name and the first split name found.
 * @throws Error when no config or no split is found, or when either request fails.
 */
export async function getDefaultConfigAndSplit(datasetId: string): Promise<{ config: string; split: string }> {
  const configsPayload = await fetchJson<any>(
    buildUrl("https://datasets-server.huggingface.co/configs", { dataset: datasetId }),
    "Failed to get configs",
  );
  const config: string | undefined = configsPayload.configs?.[0]?.config;
  if (!config) throw new Error("No configs found for dataset");

  const infoPayload = await fetchJson<any>(
    buildUrl("https://datasets-server.huggingface.co/info", { dataset: datasetId, config }),
    "Failed to get dataset info",
  );
  // Only the first split key is needed; a missing `splits` map counts as empty.
  const [split] = Object.keys(infoPayload.dataset_info?.splits || {});
  if (!split) throw new Error("No splits found for dataset");

  return { config, split };
}
40
+
41
/**
 * Searches datasets through `https://huggingface.co/api/datasets`.
 *
 * @param query - Free-text search string, sent as the `search` parameter.
 * @param limit - Maximum number of results to request; defaults to 20.
 * @returns One `{ id, title, url }` entry per dataset; `title` repeats the
 *   id and `url` points at the dataset page on huggingface.co.
 * @throws RangeError when `limit` is not a positive integer.
 * @throws Error when the request fails or the response is not an array.
 */
export async function searchDatasets(
  query: string,
  limit = 20,
): Promise<Array<{ id: string; title: string; url: string }>> {
  if (!Number.isInteger(limit) || limit < 1) {
    throw new RangeError(`limit must be a positive integer, got ${limit}`);
  }

  const url = buildUrl("https://huggingface.co/api/datasets", { search: query, limit: String(limit) });
  const datasets = await fetchJson<unknown>(url, "Dataset search failed");
  // Fail with context rather than an opaque TypeError from `.map` when the
  // payload is not a list (for example an error object).
  if (!Array.isArray(datasets)) {
    throw new Error("Dataset search failed: response was not a list of datasets");
  }

  return datasets.map((dataset: { id: string }) => ({
    id: dataset.id,
    title: dataset.id,
    url: `https://huggingface.co/datasets/${dataset.id}`,
  }));
}
50
+
51
/**
 * Collects a dataset's info plus a sample of its rows into one text document.
 *
 * The config and split come from `getDefaultConfigAndSplit`. Dataset info is
 * required, so a failure there rejects. The row sample is best-effort: any
 * failure while fetching or formatting rows yields the
 * "Sample data: Not available" placeholder instead of an error.
 *
 * @param datasetId - Dataset identifier.
 * @param sampleSize - Number of rows to request from offset 0; defaults to 50
 *   and is clamped to the range 1-100.
 * @returns `{ id, title, text, url }` where `text` holds the pretty-printed
 *   dataset info followed by the sample section.
 * @throws Error when the config/split lookup or the info request fails.
 */
export async function fetchDatasetAggregate(
  datasetId: string,
  sampleSize = 50,
): Promise<{
  id: string;
  title: string;
  text: string;
  url: string;
}> {
  const { config, split } = await getDefaultConfigAndSplit(datasetId);

  const infoUrl = buildUrl("https://datasets-server.huggingface.co/info", {
    dataset: datasetId,
    config,
  });
  const infoResult = await fetchJson<any>(infoUrl, "Failed to get dataset info");

  // Non-finite or sub-1 sizes fall back to 1; the upper bound is capped at 100.
  const length = Math.min(100, Math.max(1, Math.trunc(sampleSize) || 1));
  const rowsUrl = buildUrl("https://datasets-server.huggingface.co/rows", {
    dataset: datasetId,
    config,
    split,
    offset: "0",
    length: String(length),
  });

  let sampleData: string;
  try {
    // Use the shared helper so a non-OK status is handled the same way as
    // everywhere else; here it simply lands in the catch below.
    const rowsResult = await fetchJson<any>(rowsUrl, "Failed to get sample rows");
    sampleData = `\n\nSample data (${config}/${split}):\n${JSON.stringify(rowsResult.rows || [], null, 2)}`;
  } catch {
    // Deliberate best-effort: a missing sample must not fail the whole fetch.
    sampleData = "\n\nSample data: Not available";
  }

  return {
    id: datasetId,
    title: datasetId,
    text: `Dataset Information:\n${JSON.stringify(infoResult.dataset_info, null, 2)}${sampleData}`,
    url: `https://huggingface.co/datasets/${datasetId}`,
  };
}
93
+
94
/**
 * Wraps a payload as a single text content item whose `text` is the
 * payload's compact JSON serialization.
 *
 * @param payload - Any value. Values `JSON.stringify` cannot represent
 *   (`undefined`, functions, symbols) are emitted as the JSON literal `null`
 *   so that `text` is always a string.
 * @returns `{ content: [{ type: "text", text }] }`.
 */
export function textContent(payload: unknown): { content: Array<{ type: "text"; text: string }> } {
  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols; fall back to "null" to keep `text` a valid JSON string.
  const text: string = JSON.stringify(payload) ?? "null";
  return { content: [{ type: "text" as const, text }] };
}
97
+