Alikestocode committed on
Commit
98c0d4d
·
1 Parent(s): 4b47dea

Use model-specific AWQ configs (Gemma group_size=64)

Browse files
Files changed (1) hide show
  1. quantize_to_awq_colab.ipynb +40 -2
quantize_to_awq_colab.ipynb CHANGED
@@ -104,6 +104,44 @@
104
  "## 3. Configuration\n"
105
  ]
106
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  {
108
  "cell_type": "code",
109
  "execution_count": null,
@@ -524,7 +562,7 @@
524
  " repo_id=MODELS_TO_QUANTIZE[\"router-gemma3-merged\"][\"repo_id\"],\n",
525
  " output_repo=MODELS_TO_QUANTIZE[\"router-gemma3-merged\"][\"output_repo\"],\n",
526
  " model_type=MODELS_TO_QUANTIZE[\"router-gemma3-merged\"][\"model_type\"],\n",
527
- " awq_config=AWQ_CONFIG,\n",
528
  " calibration_dataset_size=128\n",
529
  ")\n"
530
  ]
@@ -547,7 +585,7 @@
547
  " repo_id=MODELS_TO_QUANTIZE[\"router-qwen3-32b-merged\"][\"repo_id\"],\n",
548
  " output_repo=MODELS_TO_QUANTIZE[\"router-qwen3-32b-merged\"][\"output_repo\"],\n",
549
  " model_type=MODELS_TO_QUANTIZE[\"router-qwen3-32b-merged\"][\"model_type\"],\n",
550
- " awq_config=AWQ_CONFIG,\n",
551
  " calibration_dataset_size=128\n",
552
  ")\n"
553
  ]
 
104
  "## 3. Configuration\n"
105
  ]
106
  },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "# Model-specific AWQ overrides. Keys match MODELS_TO_QUANTIZE entries.\n",
114
+ "MODEL_AWQ_OVERRIDES = {\n",
115
+ " \"router-gemma3-merged\": {\"group_size\": 64},\n",
116
+ "}\n",
117
+ "\n",
118
+ "# Derived AWQ configs per model (defaults + overrides)\n",
119
+ "MODEL_AWQ_CONFIGS = {\n",
120
+ " model_key: {**AWQ_CONFIG, **MODEL_AWQ_OVERRIDES.get(model_key, {})}\n",
121
+ " for model_key in MODELS_TO_QUANTIZE.keys()\n",
122
+ "}\n",
123
+ "\n"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "# Model-specific overrides for AWQ quantization\n",
133
+ "# These are merged into the base AWQ config inside quantize_model_to_awq\n",
134
+ "MODEL_AWQ_OVERRIDES = {\n",
135
+ " # Gemma 3 uses linear layers whose column widths are not divisible by 128.\n",
136
+ " # Using group_size=64 avoids quantization failures while retaining accuracy.\n",
137
+ " \"gemma\": {\n",
138
+ " \"group_size\": 64,\n",
139
+ " },\n",
140
+ " # Add additional overrides keyed by model_type as needed\n",
141
+ "}\n",
142
+ "\n"
143
+ ]
144
+ },
145
  {
146
  "cell_type": "code",
147
  "execution_count": null,
 
562
  " repo_id=MODELS_TO_QUANTIZE[\"router-gemma3-merged\"][\"repo_id\"],\n",
563
  " output_repo=MODELS_TO_QUANTIZE[\"router-gemma3-merged\"][\"output_repo\"],\n",
564
  " model_type=MODELS_TO_QUANTIZE[\"router-gemma3-merged\"][\"model_type\"],\n",
565
+ " awq_config=MODEL_AWQ_CONFIGS[\"router-gemma3-merged\"],\n",
566
  " calibration_dataset_size=128\n",
567
  ")\n"
568
  ]
 
585
  " repo_id=MODELS_TO_QUANTIZE[\"router-qwen3-32b-merged\"][\"repo_id\"],\n",
586
  " output_repo=MODELS_TO_QUANTIZE[\"router-qwen3-32b-merged\"][\"output_repo\"],\n",
587
  " model_type=MODELS_TO_QUANTIZE[\"router-qwen3-32b-merged\"][\"model_type\"],\n",
588
+ " awq_config=MODEL_AWQ_CONFIGS[\"router-qwen3-32b-merged\"],\n",
589
  " calibration_dataset_size=128\n",
590
  ")\n"
591
  ]