# DAT-Byte-Demo / inference / inference.py
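"""Inference utilities for the DAT-Byte DiffTransformer demo.

Loads a trained DiffTransformerLLM checkpoint, applies device-specific
optimizations (int8 dynamic quantization on CPU, fp16 on CUDA), and generates
text from a byte-level prompt using temperature, top-k, top-p, and
repetition-penalty sampling.

Illustrative invocation (the exact module path depends on your checkout,
since this file uses a relative import of .model):
    python -m inference.inference --checkpoint checkpoints/model.pt --prompt "Hello"
"""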
import argparse
import os

import torch
import torch.nn.functional as F
import torch.quantization
from .model import (
DiffTransformerLLM,
ByteTokenizer,
IM_START_TOKEN,
IM_END_TOKEN,
PAD_TOKEN,
)
force_CPU = True  # Force CPU inference even when CUDA is available
def list_checkpoints(checkpoint_dir="checkpoints"):
"""List all available checkpoints in the directory."""
if not os.path.exists(checkpoint_dir):
print(f"Checkpoint directory {checkpoint_dir} not found.")
return []
checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pt")]
return sorted(checkpoints)
def load_model(checkpoint_path, device=None, fp16=True):
"""Load a trained model from a checkpoint, applying optimizations as needed."""
if device is None:
device = torch.device(
"cuda" if torch.cuda.is_available() and not force_CPU else "cpu"
)
print(f"Loading checkpoint from {checkpoint_path}")
checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # Model hyperparameters (must match the configuration the checkpoint was trained with)
vocab_size = 259 # 256 bytes + 3 special tokens
embed_dim = 768
num_layers = 28
num_heads = 12
ffn_hidden_dim = embed_dim * 4
max_seq_len = 512
    dropout = 0.1  # Inactive under model.eval(); value kept to match the training config
# Model
model = DiffTransformerLLM(
vocab_size=vocab_size,
embed_dim=embed_dim,
num_layers=num_layers,
num_heads=num_heads,
ffn_hidden_dim=ffn_hidden_dim,
max_seq_len=max_seq_len,
dropout=dropout,
)
# The checkpoint is the state dict itself
state_dict = checkpoint
# Load the state dict into the float32 model first
model.load_state_dict(state_dict)
model.eval()
# Apply device-specific optimizations
if device.type == "cpu":
print("Optimizing for CPU with dynamic quantization (int8).")
# Set the quantization engine
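        # qnnpack targets ARM CPUs; on x86 hosts, "fbgemm" is the more common engine choice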
torch.backends.quantized.engine = "qnnpack"
# Quantize the linear layers to int8 for performance
model = torch.quantization.quantize_dynamic(
model, {torch.nn.Linear}, dtype=torch.qint8
)
elif device.type == "cuda" and fp16:
print("Casting model to fp16 for CUDA.")
model = model.half()
model = model.to(device)
print("Model loaded successfully.")
return model
def generate_text(
model,
tokenizer,
prompt,
max_new_tokens=100,
temperature=1.0,
top_k=0,
top_p=0.9,
repetition_penalty=1.0,
device=None,
    stop_sequences=None,
):
"""
Generate text from a prompt using the trained model.
Args:
model: The trained DiffTransformerLLM model
tokenizer: ByteTokenizer instance
prompt: Text prompt to start generation (as a string)
max_new_tokens: Maximum number of new tokens to generate
temperature: Controls randomness. Lower is more deterministic.
top_k: If > 0, only sample from the top k most likely tokens
top_p: If > 0, sample from the smallest set of tokens whose cumulative probability exceeds p
repetition_penalty: Penalize repetition. 1.0 means no penalty.
        device: Device to run inference on
        stop_sequences: List of strings; generation stops (and the match is trimmed) when any is produced
    Returns:
        A tuple (generated_text, full_text): the completion alone and the prompt plus completion
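    Example (illustrative; assumes a loaded model and ByteTokenizer):
        completion, full_text = generate_text(
            model, tokenizer, "Hello, world",
            max_new_tokens=50, temperature=0.7, top_k=10, top_p=0.9,
        )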
"""
if device is None:
device = torch.device(
"cuda" if torch.cuda.is_available() and not force_CPU else "cpu"
)
# Convert prompt to bytes and tokenize - process as-is without adding special tokens
prompt_bytes = prompt.encode("utf-8", errors="replace")
input_ids = (
torch.tensor(
tokenizer.encode(prompt_bytes, add_special_tokens=False), dtype=torch.long
)
.unsqueeze(0)
.to(device)
)
    # Pre-encode stop sequences into token-ID lists so they can be matched against
    # the tail of the generated IDs during the decoding loop
    stop_sequences = [
        tokenizer.encode(
            seq.encode("utf-8", errors="replace"), add_special_tokens=False
        )
        for seq in (stop_sequences or [])
    ]
# Track generated token IDs
generated_ids = input_ids.clone()
generated_bytes = b""
# Set the model to evaluation mode
model.eval()
with torch.no_grad():
for _ in range(max_new_tokens):
# Only use the last max_seq_len tokens if we exceed the model's context length
if generated_ids.size(1) > model.max_seq_len:
input_ids = generated_ids[:, -model.max_seq_len :]
else:
input_ids = generated_ids
# Forward pass to get logits for the next token
logits = model(input_ids)
# Get logits for the next token (last position)
next_token_logits = logits[:, -1, :].squeeze(0)
            # Apply temperature scaling (temperature <= 0 is handled as greedy decoding below)
            if temperature > 0:
                next_token_logits = next_token_logits / temperature
            # Apply repetition penalty: divide positive logits and multiply negative ones
            # so that already-generated tokens always become less likely
            if repetition_penalty > 1.0:
                for token_id in set(generated_ids[0].tolist()):
                    if next_token_logits[token_id] > 0:
                        next_token_logits[token_id] /= repetition_penalty
                    else:
                        next_token_logits[token_id] *= repetition_penalty
# Apply top-k filtering
if top_k > 0:
top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k)
next_token_logits = torch.full_like(next_token_logits, float("-inf"))
next_token_logits.scatter_(0, top_k_indices, top_k_logits)
# Apply top-p (nucleus) filtering
if 0 < top_p < 1.0:
sorted_logits, sorted_indices = torch.sort(
next_token_logits, descending=True
)
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=0), dim=0)
# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove = cumulative_probs > top_p
# Shift the indices to the right to keep the first token above the threshold
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
..., :-1
].clone()
sorted_indices_to_remove[..., 0] = 0
indices_to_remove = sorted_indices[sorted_indices_to_remove]
next_token_logits[indices_to_remove] = float("-inf")
            # Sample from the filtered distribution (greedy argmax when temperature <= 0)
            if temperature > 0:
                probs = F.softmax(next_token_logits, dim=0)
                next_token = torch.multinomial(probs, 1)
            else:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
# Append the generated token to the sequence
generated_ids = torch.cat([generated_ids, next_token.unsqueeze(0)], dim=1)
            # Decode the new token to bytes for streaming output and stop-sequence matching
token_bytes = tokenizer.decode([next_token.item()])
generated_bytes += token_bytes
try:
print(token_bytes.decode("utf-8", errors="replace"), end="", flush=True)
except Exception as e:
print(f"<Error decoding token: {e}>", end="", flush=True)
stop_generated = False
stop_seq = None
for stop_seq in stop_sequences:
if generated_ids.tolist()[0][-len(stop_seq) :] == stop_seq:
stop_generated = True
break
if stop_generated:
# Remove the stop sequence from the generated IDs
generated_ids = generated_ids[:, : -len(stop_seq)]
generated_bytes = generated_bytes[: -len(stop_seq)]
break
# Decode to bytes and then to string
try:
generated_text = generated_bytes.decode("utf-8", errors="replace")
except Exception as e:
print(f"\nError decoding generated text: {e}")
generated_text = "<decoding error>"
return generated_text, prompt + generated_text
def main():
parser = argparse.ArgumentParser(
description="Text generation with DiffAttention LLM"
)
parser.add_argument("--checkpoint", type=str, help="Path to the checkpoint file")
    parser.add_argument(
        "--prompt",
        type=str,
        default="""\nHow many 'b's are in "barber"? \n""",
        help="Prompt text to start generation",
    )
parser.add_argument(
"--max_tokens",
type=int,
default=500,
help="Maximum number of tokens to generate",
)
parser.add_argument(
"--temperature", type=float, default=0.7, help="Sampling temperature"
)
parser.add_argument(
"--top_k", type=int, default=10, help="Top-k sampling parameter (0 to disable)"
)
parser.add_argument(
"--top_p",
type=float,
default=0.9,
help="Top-p (nucleus) sampling parameter (0 to disable)",
)
parser.add_argument(
"--repetition_penalty",
type=float,
default=1.2,
help="Repetition penalty (1.0 for no penalty)",
)
parser.add_argument(
"--list_checkpoints",
action="store_true",
help="List available checkpoints and exit",
)
args = parser.parse_args()
# List checkpoints if requested
if args.list_checkpoints:
print("Available checkpoints:")
checkpoints = list_checkpoints()
for i, ckpt in enumerate(checkpoints):
print(f"{i+1}. {ckpt}")
return
# If no checkpoint specified, use the latest one
if not args.checkpoint:
checkpoints = list_checkpoints()
if not checkpoints:
print("No checkpoints found. Please train the model first.")
return
# Find the latest epoch_end checkpoint
end_checkpoints = [ckpt for ckpt in checkpoints if "end.pt" in ckpt]
if end_checkpoints:
latest_checkpoint = max(end_checkpoints)
else:
latest_checkpoint = max(checkpoints)
checkpoint_path = os.path.join("checkpoints", latest_checkpoint)
else:
checkpoint_path = args.checkpoint
# Set device
device = torch.device(
"cuda" if torch.cuda.is_available() and not force_CPU else "cpu"
)
print(f"Using device: {device}")
# Initialize tokenizer
tokenizer = ByteTokenizer()
# Load model
model = load_model(checkpoint_path, device)
# Generate text
print(f"\nGenerating text with prompt: '{args.prompt}'")
print(
f"Parameters: temperature={args.temperature}, top_k={args.top_k}, top_p={args.top_p}, repetition_penalty={args.repetition_penalty}"
)
print("\nGenerating...")
generated_text, full_text = generate_text(
model=model,
tokenizer=tokenizer,
prompt=args.prompt,
max_new_tokens=args.max_tokens,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
repetition_penalty=args.repetition_penalty,
device=device,
)
print("\n\nGenerated completion only:")
print("-" * 40)
print(generated_text)
print("-" * 40)
print("\nFull generated text (prompt + completion):")
print("-" * 40)
print(full_text)
print("-" * 40)
if __name__ == "__main__":
import argparse
main()