Instructions to use evilsocket/Phi-4-mini-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use evilsocket/Phi-4-mini-instruct with Transformers:

# High-level helper: build a text-generation pipeline and hand it the chat messages.
from transformers import pipeline

# A single-turn conversation with one user message.
messages = [{"role": "user", "content": "Who are you?"}]

pipe = pipeline(
    "text-generation",
    model="evilsocket/Phi-4-mini-instruct",
    trust_remote_code=True,
)
pipe(messages)

# Load the tokenizer and the causal-LM weights directly, then generate by hand.
from transformers import AutoTokenizer, AutoModelForCausalLM

repo_id = "evilsocket/Phi-4-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

# A single-turn conversation with one user message.
messages = [{"role": "user", "content": "Who are you?"}]

# Render the conversation through the chat template as PyTorch tensors,
# then move them to the device the model lives on.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Skip the prompt positions so only the newly generated part is printed.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use evilsocket/Phi-4-mini-instruct with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "evilsocket/Phi-4-mini-instruct"
# Call the server using curl (OpenAI-compatible API):
# Posts one user message as a JSON chat-completions request to localhost:8000.
# NOTE(review): `vllm serve` above sets no port, so 8000 is presumably its default - confirm.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "evilsocket/Phi-4-mini-instruct",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

# Run the model by its hf.co reference using Docker's `model run` subcommand.
docker model run hf.co/evilsocket/Phi-4-mini-instruct

SGLang

How to use evilsocket/Phi-4-mini-instruct with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
# Binds to 0.0.0.0 on port 30000, the same port the curl call below targets.
python3 -m sglang.launch_server \
    --model-path "evilsocket/Phi-4-mini-instruct" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# Posts one user message as a JSON chat-completions request to localhost:30000.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "evilsocket/Phi-4-mini-instruct",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

# Launch the SGLang server inside the lmsysorg/sglang:latest image.
# - `-p 30000:30000` exposes the port the server is started on below.
# - `-v ~/.cache/huggingface:/root/.cache/huggingface` shares the local Hugging Face cache directory with the container.
# - `HF_TOKEN=<secret>` is a placeholder; substitute a real token before running.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "evilsocket/Phi-4-mini-instruct" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
# Posts one user message as a JSON chat-completions request to localhost:30000.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "evilsocket/Phi-4-mini-instruct",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use evilsocket/Phi-4-mini-instruct with Docker Model Runner:
```
# Run the model with Docker Model Runner, referenced by its hf.co path.
docker model run hf.co/evilsocket/Phi-4-mini-instruct
```

Phi-4-mini-instruct / sample_finetune.py

evilsocket

Duplicate from microsoft/Phi-4-mini-instruct

f228fba about 2 months ago

raw

history blame contribute delete

6.17 kB

	import sys
	import logging

	import datasets
	from datasets import load_dataset
	from peft import LoraConfig
	import torch
	import transformers
	from trl import SFTTrainer
	from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

	"""
	A simple example on using SFTTrainer and Accelerate to finetune Phi-4-Mini-Instruct model. For
	a more advanced example, please follow HF alignment-handbook/scripts/run_sft.py.
	This example has utilized DeepSpeed ZeRO3 offload to reduce the memory usage. The
	script can be run on V100 or later generation GPUs. Here are some suggestions on
	further reducing memory consumption:
	- reduce batch size
	- decrease lora dimension
	- restrict lora target modules
	Please follow these steps to run the script:
	1. Install dependencies:
	conda install -c conda-forge accelerate=1.3.0
	pip3 install -i https://pypi.org/simple/ bitsandbytes
	pip3 install peft==0.14.0
	pip3 install transformers==4.48.1
	pip3 install trl datasets
	pip3 install deepspeed
	2. Setup accelerate and deepspeed config based on the machine used:
	accelerate config
	Here is a sample config for deepspeed zero3:
	compute_environment: LOCAL_MACHINE
	debug: false
	deepspeed_config:
	gradient_accumulation_steps: 1
	offload_optimizer_device: none
	offload_param_device: none
	zero3_init_flag: true
	zero3_save_16bit_model: true
	zero_stage: 3
	distributed_type: DEEPSPEED
	downcast_bf16: 'no'
	enable_cpu_affinity: false
	machine_rank: 0
	main_training_function: main
	mixed_precision: bf16
	num_machines: 1
	num_processes: 4
	rdzv_backend: static
	same_network: true
	tpu_env: []
	tpu_use_cluster: false
	tpu_use_sudo: false
	use_cpu: false
	3. check accelerate config:
	accelerate env
	4. Run the code:
	accelerate launch sample_finetune.py
	"""

	logger = logging.getLogger(__name__)


	###################
	# Hyper-parameters
	###################
	# Keyword arguments handed unchanged to TrainingArguments below.
	training_config = dict(
	    bf16=True,
	    do_eval=False,
	    learning_rate=5.0e-06,
	    log_level="info",
	    logging_steps=20,
	    logging_strategy="steps",
	    lr_scheduler_type="cosine",
	    num_train_epochs=1,
	    max_steps=-1,
	    output_dir="./checkpoint_dir",
	    overwrite_output_dir=True,
	    per_device_eval_batch_size=4,
	    per_device_train_batch_size=4,
	    remove_unused_columns=True,
	    save_steps=100,
	    save_total_limit=1,
	    seed=0,
	    gradient_checkpointing=True,
	    gradient_checkpointing_kwargs=dict(use_reentrant=False),
	    gradient_accumulation_steps=1,
	    warmup_ratio=0.2,
	)

	# Keyword arguments handed unchanged to LoraConfig below.
	peft_config = dict(
	    r=16,
	    lora_alpha=32,
	    lora_dropout=0.05,
	    bias="none",
	    task_type="CAUSAL_LM",
	    target_modules="all-linear",
	    modules_to_save=None,
	)

	# Materialize both plain dicts into their configuration objects.
	train_conf = TrainingArguments(**training_config)
	peft_conf = LoraConfig(**peft_config)


	###############
	# Setup logging
	###############
	# Send log records to stdout with a timestamped format.
	logging.basicConfig(
	    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
	    datefmt="%Y-%m-%d %H:%M:%S",
	    handlers=[logging.StreamHandler(sys.stdout)],
	)
	# Apply the per-process level from the training arguments to this module's
	# logger and to the datasets / transformers loggers.
	log_level = train_conf.get_process_log_level()
	logger.setLevel(log_level)
	datasets.utils.logging.set_verbosity(log_level)
	transformers.utils.logging.set_verbosity(log_level)
	transformers.utils.logging.enable_default_handler()
	transformers.utils.logging.enable_explicit_format()

	# Log on each process a small summary.
	# "16-bits training" covers both half-precision switches: this script's
	# training_config enables bf16 (not fp16), so reporting fp16 alone would
	# print False for a run that does train in 16-bit precision.
	logger.warning(
	    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
	    + f" distributed training: {train_conf.local_rank != -1},"
	    + f" 16-bits training: {train_conf.fp16 or train_conf.bf16}"
	)
	logger.info(f"Training/evaluation parameters {train_conf}")
	logger.info(f"PEFT parameters {peft_conf}")


	################
	# Model Loading
	################
	checkpoint_path = "microsoft/Phi-4-mini-instruct"

	# Options forwarded to AutoModelForCausalLM.from_pretrained.
	model_kwargs = {
	    "use_cache": False,
	    "trust_remote_code": True,
	    # load the model with flash-attention support
	    "attn_implementation": "flash_attention_2",
	    "torch_dtype": torch.bfloat16,
	    "device_map": None,
	}
	model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)

	tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
	tokenizer.model_max_length = 2048
	# Pad with the unk token instead of eos; the original author's stated
	# reason is to prevent endless generation.
	tokenizer.pad_token = tokenizer.unk_token
	# Keep the pad id in step with the pad token chosen on the previous line.
	tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
	tokenizer.padding_side = "right"


	##################
	# Data Processing
	##################
	def apply_chat_template(
	    example,
	    tokenizer,
	):
	    """Render one conversation record into a single text field.

	    Args:
	        example: Mapping-like record; its ``"messages"`` entry is handed
	            to the tokenizer's chat template.
	        tokenizer: Object exposing ``apply_chat_template``.

	    Returns:
	        The same ``example`` object, with the rendered conversation
	        stored under the ``"text"`` key.
	    """
	    messages = example["messages"]
	    # Ask for the untokenized template output with no generation prompt
	    # appended.
	    example["text"] = tokenizer.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=False
	    )
	    return example


	# Load the train and test SFT splits in one call.
	splits = load_dataset("HuggingFaceH4/ultrachat_200k", split=["train_sft", "test_sft"])
	train_dataset, test_dataset = splits

	# Column names are read from the train split and reused for both splits.
	column_names = list(train_dataset.features)


	def _render_split(split, description):
	    """Map apply_chat_template over a split, dropping the columns in column_names."""
	    return split.map(
	        apply_chat_template,
	        fn_kwargs={"tokenizer": tokenizer},
	        num_proc=10,
	        remove_columns=column_names,
	        desc=description,
	    )


	processed_train_dataset = _render_split(
	    train_dataset, "Applying chat template to train_sft"
	)

	processed_test_dataset = _render_split(
	    test_dataset, "Applying chat template to test_sft"
	)


	###########
	# Training
	###########
	# NOTE(review): the install steps in this file leave `trl` unpinned, and
	# these SFTTrainer keyword names (max_seq_length, dataset_text_field,
	# tokenizer, packing) presumably match an older TRL release - confirm
	# against the installed version.
	sft_kwargs = {
	    "model": model,
	    "args": train_conf,
	    "peft_config": peft_conf,
	    "train_dataset": processed_train_dataset,
	    "eval_dataset": processed_test_dataset,
	    "max_seq_length": 2048,
	    "dataset_text_field": "text",
	    "tokenizer": tokenizer,
	    "packing": True,
	}
	trainer = SFTTrainer(**sft_kwargs)

	# Run training, then log and persist its metrics and the trainer state.
	train_result = trainer.train()
	metrics = train_result.metrics
	for record in (trainer.log_metrics, trainer.save_metrics):
	    record("train", metrics)
	trainer.save_state()


	#############
	# Evaluation
	#############
	# Switch the tokenizer to left padding before evaluating.
	tokenizer.padding_side = "left"
	metrics = trainer.evaluate()
	# Record the size of the processed test split alongside the metrics.
	metrics.update(eval_samples=len(processed_test_dataset))
	trainer.log_metrics("eval", metrics)
	trainer.save_metrics("eval", metrics)


	# ############
	# # Save model
	# ############
	# Write the trained model into the configured output directory.
	trainer.save_model(train_conf.output_dir)