Abstract
We present the BARD dataset (Basketball Action Recognition Dataset). It is designed to advance video action recognition in basketball through high-quality annotations and enriched contextual data. BARD improves upon existing datasets by including player jersey numbers, team colors, and a novel output format supporting multi-label classification. To ensure annotation quality, we conducted a human validation study on a subsample of the annotations, with expert reviewers assessing the labeling quality and reporting the evaluation results, thereby providing human-validated independent benchmarks. Moreover, in addition to standard caption-based action recognition metrics, we introduce the Basketball Caption Evaluation Framework (BaCEF), a new application-oriented evaluation framework. Finally, to demonstrate the quality and challenging nature of the dataset, as well as the utility of our evaluation framework and its potential applications, we evaluate both proprietary models (e.g., Gemini 2.5 Pro) and open-source models (Qwen2.5-VL-7B-Instruct, Qwen2.5-VL-3B-Instruct), including BQwen2.5-VL-3B, a BARD fine-tuned variant of Qwen2.5-VL-3B-Instruct, across our defined benchmarks.
Project Git Folder
The official repository of the project is https://github.com/GabrieleGiudic/BARD
Setup
For the general setup look at https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct
Call the model according to our setup
import os
import json
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from tqdm import tqdm
import warnings
from qwen_vl_utils import process_vision_info
# Suppress a known transformers warning
#warnings.filterwarnings("ignore", message="The models vision encoder did not receive absolute position embeddings")
# --- 1. Configuration ---
MODEL_PATH = "GabrieleGiudici/BQwen2.5-VL-3B"
##get the file from BARD https://github.com/GabrieleGiudic/BARD/tree/master/validation/2025
DATA_FILE = "./data2025/model_input_dataset_color_number_2025.json"
OUTPUT_DIR = "./output"
PREDICTIONS_FILE = os.path.join(OUTPUT_DIR, "action_caption.json")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def load_model_and_processor():
"""Loads the Qwen model and processor."""
print(f"?? Loading fine-tuned model from: {MODEL_PATH}")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_PATH,
torch_dtype="auto",
device_map="auto",
trust_remote_code=True
).eval()
min_pixels = 256 * 28 * 28
max_pixels = 448 * 28 * 28#1280 * 28 * 28
processor = AutoProcessor.from_pretrained(MODEL_PATH, min_pixels=min_pixels, max_pixels=max_pixels
)
print("? Model and processor loaded successfully.")
return model, processor
def main():
"""Main function to run inference and save predictions."""
os.makedirs(OUTPUT_DIR, exist_ok=True)
model, processor = load_model_and_processor()
print(f"?? Loading data from {DATA_FILE}...")
with open(DATA_FILE, 'r') as f:
dataset = json.load(f)
results = []
print(f"\n?? Starting inference on {len(dataset)} videos...")
fps = 3.0
for item in tqdm(dataset, desc="Processing videos"):
video_path = item['video']
human_prompt = ""
for conv in item['conversations']:
if conv['from'] == 'human':
human_prompt = conv['value'].replace('<video>\n', '').strip()
break
if not os.path.exists(video_path):
print(f"?? Warning: video not found at {video_path}. Skipping.")
continue
if not human_prompt:
print(f"?? Warning: No human prompt found for {video_path}. Skipping.")
continue
# Prepare model inputs
messages = [{"role": "user", "content": [{"type": "video", "video": video_path,"resized_height": 420,
"resized_width": 784,"fps": fps,}, {"type": "text", "text": human_prompt}]}]
# Preparation for inference
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Process the vision info
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
# Pass through processor
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
**video_kwargs,
)
# Move to CUDA
inputs = inputs.to("cuda")
# Generate response
generated_ids = model.generate(**inputs, max_new_tokens=400, do_sample=False)
generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
response = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
results.append({
"image": video_path,
"ground_truth": item['conversations'][1]['value'],
"prediction": response
})
print(results[-1])
with open(PREDICTIONS_FILE, 'w') as f:
json.dump(results, f, indent=4)
print(f"\n? Inference complete. Predictions saved to {PREDICTIONS_FILE}")
if __name__ == "__main__":
    main()
```
---
license: apache-2.0
language:
- en
base_model:
- Qwen/Qwen2.5-VL-3B-Instruct
pipeline_tag: video-text-to-text
---
Downloads last month: 1