"""Configuration for HuggingFace Inference Providers models. Based on: https://huggingface.co/inference/models This module provides model and provider configurations with verification capabilities to ensure models are actually available on selected providers. """ from typing import TypedDict class ModelProvider(TypedDict): """Provider information for a model.""" name: str input_cost: float | None # $/1M tokens output_cost: float | None # $/1M tokens latency: float | None # seconds throughput: float | None # tokens/second supports_tools: bool supports_structured: bool requires_auth: bool # Whether this provider requires authentication class InferenceModel(TypedDict): """Model configuration with available providers.""" model_id: str display_name: str providers: dict[str, ModelProvider] requires_auth: bool # Whether the model itself requires authentication (gated) description: str # Latest Reasoning Models from https://huggingface.co/inference/models # Updated with latest reasoning models (Qwen3-Next, Qwen3-235B, Llama-3.3, etc.) INFERENCE_MODELS: dict[str, InferenceModel] = { # Top-tier reasoning models (latest) "Qwen/Qwen3-Next-80B-A3B-Thinking": { "model_id": "Qwen/Qwen3-Next-80B-A3B-Thinking", "display_name": "Qwen3-Next-80B-A3B-Thinking", "requires_auth": True, # Gated "description": "Qwen's latest reasoning model - Advanced thinking capabilities, 262K context", "providers": { "together": { "name": "together", "input_cost": 0.15, "output_cost": 1.5, "latency": 0.48, "throughput": 202.0, "supports_tools": True, "supports_structured": True, "requires_auth": True, }, "together-fastest": { "name": "together-fastest", "input_cost": 0.15, "output_cost": 1.5, "latency": 0.48, "throughput": 202.0, "supports_tools": True, "supports_structured": True, "requires_auth": True, }, }, }, "Qwen/Qwen3-Next-80B-A3B-Instruct": { "model_id": "Qwen/Qwen3-Next-80B-A3B-Instruct", "display_name": "Qwen3-Next-80B-A3B-Instruct", "requires_auth": True, # Gated "description": "Qwen's latest instruction model - High performance, 262K context", "providers": { "together": { "name": "together", "input_cost": 0.15, "output_cost": 1.5, "latency": 0.60, "throughput": 153.0, "supports_tools": True, "supports_structured": True, "requires_auth": True, }, "together-fastest": { "name": "together-fastest", "input_cost": 0.15, "output_cost": 1.5, "latency": 0.60, "throughput": 153.0, "supports_tools": True, "supports_structured": True, "requires_auth": True, }, }, }, "Qwen/Qwen3-235B-A22B-Instruct-2507": { "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507", "display_name": "Qwen3-235B-A22B-Instruct", "requires_auth": True, # Gated "description": "Qwen's massive 235B model - Ultra-high performance, 262K context", "providers": { "cerebras": { "name": "cerebras", "input_cost": 0.6, "output_cost": 1.2, "latency": 0.23, "throughput": 509.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, "cerebras-fastest": { "name": "cerebras-fastest", "input_cost": 0.6, "output_cost": 1.2, "latency": 0.23, "throughput": 509.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, "together": { "name": "together", "input_cost": 0.2, "output_cost": 0.6, "latency": 0.39, "throughput": 42.0, "supports_tools": True, "supports_structured": True, "requires_auth": True, }, }, }, "Qwen/Qwen3-235B-A22B-Thinking-2507": { "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507", "display_name": "Qwen3-235B-A22B-Thinking", "requires_auth": True, # Gated "description": "Qwen's massive 235B reasoning model - Advanced thinking, 262K context", "providers": { "cerebras": { "name": "cerebras", "input_cost": None, "output_cost": None, "latency": None, "throughput": None, "supports_tools": False, "supports_structured": False, "requires_auth": True, }, }, }, "meta-llama/Llama-3.3-70B-Instruct": { "model_id": "meta-llama/Llama-3.3-70B-Instruct", "display_name": "Llama 3.3 70B Instruct", "requires_auth": True, # Gated "description": "Meta's latest Llama 3.3 - High performance, tools support", "providers": { "cerebras": { "name": "cerebras", "input_cost": 0.85, "output_cost": 1.2, "latency": 0.35, "throughput": 948.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, "cerebras-fastest": { "name": "cerebras-fastest", "input_cost": 0.85, "output_cost": 1.2, "latency": 0.35, "throughput": 948.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, }, }, "openai/gpt-oss-120b": { "model_id": "openai/gpt-oss-120b", "display_name": "GPT-OSS-120B", "requires_auth": True, # Gated "description": "OpenAI's open-source 120B model - Ultra-fast inference", "providers": { "cerebras": { "name": "cerebras", "input_cost": 0.25, "output_cost": 0.69, "latency": 0.23, "throughput": 1051.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, "cerebras-fastest": { "name": "cerebras-fastest", "input_cost": 0.25, "output_cost": 0.69, "latency": 0.23, "throughput": 1051.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, }, }, "CohereLabs/command-a-reasoning-08-2025": { "model_id": "CohereLabs/command-a-reasoning-08-2025", "display_name": "Command A Reasoning 08-2025", "requires_auth": True, # Gated "description": "Cohere's latest reasoning model - Specialized for reasoning tasks", "providers": { "cohere": { "name": "cohere", "input_cost": None, "output_cost": None, "latency": 0.18, "throughput": 94.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, }, }, "zai-org/GLM-4.6": { "model_id": "zai-org/GLM-4.6", "display_name": "GLM-4.6", "requires_auth": True, # Gated "description": "ZAI's GLM-4.6 - High performance reasoning model", "providers": { "cerebras": { "name": "cerebras", "input_cost": None, "output_cost": None, "latency": 0.27, "throughput": 381.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, "cerebras-fastest": { "name": "cerebras-fastest", "input_cost": None, "output_cost": None, "latency": 0.27, "throughput": 381.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, "zai-org": { "name": "zai-org", "input_cost": None, "output_cost": None, "latency": 3.08, "throughput": 54.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, }, }, "meta-llama/Llama-3.1-8B-Instruct": { "model_id": "meta-llama/Llama-3.1-8B-Instruct", "display_name": "Llama 3.1 8B Instruct", "requires_auth": True, # Gated "description": "Meta's Llama 3.1 8B - Fast, efficient reasoning", "providers": { "novita": { "name": "novita", "input_cost": 0.02, "output_cost": 0.05, "latency": 0.64, "throughput": 84.0, "supports_tools": False, "supports_structured": False, "requires_auth": True, }, "nebius": { "name": "nebius", "input_cost": 0.03, "output_cost": 0.09, "latency": 0.35, "throughput": 194.0, "supports_tools": False, "supports_structured": True, "requires_auth": True, }, "cerebras": { "name": "cerebras", "input_cost": 0.1, "output_cost": 0.1, "latency": 0.33, "throughput": 1148.0, "supports_tools": False, "supports_structured": False, "requires_auth": True, }, "sambanova": { "name": "sambanova", "input_cost": 0.1, "output_cost": 0.2, "latency": 0.85, "throughput": 527.0, "supports_tools": True, "supports_structured": True, "requires_auth": True, }, }, }, "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": { "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "display_name": "DeepSeek R1 Distill Llama 70B", "requires_auth": True, # Gated "description": "DeepSeek's reasoning model - Advanced chain-of-thought", "providers": { "novita": { "name": "novita", "input_cost": 0.64, "output_cost": 0.64, "latency": 1.21, "throughput": 31.0, "supports_tools": False, "supports_structured": False, "requires_auth": True, }, "sambanova": { "name": "sambanova", "input_cost": 0.7, "output_cost": 1.4, "latency": 2.67, "throughput": 158.0, "supports_tools": False, "supports_structured": False, "requires_auth": True, }, "nscale": { "name": "nscale", "input_cost": 0.75, "output_cost": 0.75, "latency": 1.24, "throughput": 16.0, "supports_tools": False, "supports_structured": False, "requires_auth": True, }, }, }, "moonshotai/Kimi-K2-Thinking": { "model_id": "moonshotai/Kimi-K2-Thinking", "display_name": "Kimi K2 Thinking", "requires_auth": True, # Gated "description": "Moonshot AI's thinking model - Long context reasoning", "providers": { "novita": { "name": "novita", "input_cost": 0.48, "output_cost": 2.0, "latency": 1.60, "throughput": 16.0, "supports_tools": True, "supports_structured": False, "requires_auth": True, }, "nebius": { "name": "nebius", "input_cost": 0.6, "output_cost": 2.5, "latency": 0.34, "throughput": 87.0, "supports_tools": True, "supports_structured": True, "requires_auth": True, }, "together": { "name": "together", "input_cost": 1.2, "output_cost": 4.0, "latency": 0.86, "throughput": 97.0, "supports_tools": True, "supports_structured": True, "requires_auth": True, }, }, }, "allenai/Olmo-3-7B-Instruct": { "model_id": "allenai/Olmo-3-7B-Instruct", "display_name": "Olmo 3 7B Instruct", "requires_auth": False, # Ungated "description": "AllenAI's open model - Good reasoning, no auth needed", "providers": { "publicai": { "name": "publicai", "input_cost": None, "output_cost": None, "latency": 1.78, "throughput": 36.0, "supports_tools": True, "supports_structured": True, "requires_auth": False, }, }, }, "Qwen/Qwen2-7B-Instruct": { "model_id": "Qwen/Qwen2-7B-Instruct", "display_name": "Qwen2 7B Instruct", "requires_auth": False, # Ungated "description": "Qwen's efficient model - Fast, no authentication", "providers": { "featherless-ai": { "name": "featherless-ai", "input_cost": None, "output_cost": None, "latency": None, "throughput": None, "supports_tools": False, "supports_structured": False, "requires_auth": False, }, }, }, "HuggingFaceH4/zephyr-7b-beta": { "model_id": "HuggingFaceH4/zephyr-7b-beta", "display_name": "Zephyr 7B Beta", "requires_auth": False, # Ungated "description": "HuggingFace's fine-tuned model - Free tier friendly", "providers": { "featherless-ai": { "name": "featherless-ai", "input_cost": None, "output_cost": None, "latency": None, "throughput": None, "supports_tools": False, "supports_structured": False, "requires_auth": False, }, }, }, "google/gemma-2-2b-it": { "model_id": "google/gemma-2-2b-it", "display_name": "Gemma 2 2B IT", "requires_auth": True, # Gated "description": "Google's compact model - Small but capable", "providers": { "nebius": { "name": "nebius", "input_cost": None, "output_cost": None, "latency": None, "throughput": None, "supports_tools": False, "supports_structured": False, "requires_auth": True, }, }, }, "microsoft/Phi-3-mini-4k-instruct": { "model_id": "microsoft/Phi-3-mini-4k-instruct", "display_name": "Phi-3 Mini 4K Instruct", "requires_auth": False, # Ungated "description": "Microsoft's efficient model - Fast inference", "providers": { "featherless-ai": { "name": "featherless-ai", "input_cost": None, "output_cost": None, "latency": None, "throughput": None, "supports_tools": False, "supports_structured": False, "requires_auth": False, }, }, }, } def get_available_models(has_auth: bool = False) -> list[tuple[str, str]]: """ Get list of available models based on authentication status. Args: has_auth: Whether user has authentication (OAuth or HF_TOKEN) Returns: List of (model_id, display_name) tuples for dropdown """ models = [] for model_id, model_info in INFERENCE_MODELS.items(): # If no auth, only show ungated models if not has_auth and model_info["requires_auth"]: continue models.append((model_id, model_info["display_name"])) return models def get_available_providers(model_id: str, has_auth: bool = False) -> list[tuple[str, str]]: """ Get list of available providers for a model based on authentication. This is a convenience wrapper around get_available_providers_verified that doesn't perform async verification. Args: model_id: The model ID has_auth: Whether user has authentication Returns: List of (provider_name, display_name) tuples for dropdown """ return get_available_providers_verified(model_id, has_auth=has_auth, verify=False) def get_model_info(model_id: str) -> InferenceModel | None: """Get model information.""" return INFERENCE_MODELS.get(model_id) def get_provider_info(model_id: str, provider_name: str) -> ModelProvider | None: """Get provider information for a model.""" model = INFERENCE_MODELS.get(model_id) if not model: return None return model["providers"].get(provider_name) def verify_provider_availability( model_id: str, provider_name: str, ) -> bool: """ Verify that a model is available on the specified provider (static check). This function checks the static configuration to see if a provider is listed for the model. For dynamic verification via API calls, use verify_provider_availability_async(). Args: model_id: The model ID to verify provider_name: The provider name to verify Returns: True if the model is configured for the provider, False otherwise """ model_config = INFERENCE_MODELS.get(model_id) if not model_config: return False providers = model_config.get("providers", {}) return provider_name in providers async def verify_provider_availability_async( model_id: str, provider_name: str, api_key: str | None = None, ) -> bool: """ Verify that a model is actually available on the specified provider via API. This function attempts to check if the model/provider combination is valid by making a lightweight API call to the HuggingFace Inference API. Note: This is an async function and should be called from an async context. For synchronous checks, use verify_provider_availability(). Args: model_id: The model ID to verify provider_name: The provider name to verify api_key: Optional API key for authentication (uses env vars if not provided) Returns: True if the model is available on the provider, False otherwise """ # For now, fall back to static check # TODO: Implement actual API verification when needed return verify_provider_availability(model_id, provider_name) def get_available_providers_verified( model_id: str, has_auth: bool = False, api_key: str | None = None, verify: bool = False, ) -> list[tuple[str, str]]: """ Get list of available providers for a model with optional verification. Args: model_id: The model ID has_auth: Whether user has authentication api_key: Optional API key for verification verify: Whether to verify provider availability (async, requires api_key) Returns: List of (provider_name, display_name) tuples for dropdown """ if model_id not in INFERENCE_MODELS: return [] model = INFERENCE_MODELS[model_id] providers = [] for provider_name, provider_info in model["providers"].items(): # If no auth, only show providers that don't require auth if not has_auth and provider_info["requires_auth"]: continue # Create display name with cost/latency info display_parts = [provider_name] if provider_info["latency"]: display_parts.append(f"{provider_info['latency']:.2f}s") if provider_info["input_cost"]: display_parts.append(f"${provider_info['input_cost']}/1M") if provider_info["supports_tools"]: display_parts.append("🔧") if provider_info["supports_structured"]: display_parts.append("📊") display_name = " | ".join(display_parts) providers.append((provider_name, display_name)) # Note: If verify=True, this should be called from an async context # For now, we return static providers. Async verification can be done separately. return providers