Abstract
We present the BARD dataset (Basketball Action Recognition Dataset). It is designed to advance video action recognition in basketball through high-quality annotations and enriched contextual data. BARD improves upon existing datasets by including player jersey numbers, team colors, and a novel output format supporting multi-label classification. To ensure annotation quality, we conducted a human validation study on a subsample of the annotations, with expert reviewers assessing the labeling quality and reporting the evaluation results, thereby providing human-validated independent benchmarks. Moreover, in addition to standard caption-based action recognition metrics, we introduce the Basketball Caption Evaluation Framework (BaCEF), a new application-oriented evaluation framework. Finally, to demonstrate the quality and challenging nature of the dataset, as well as the utility of our evaluation framework and its potential applications, we evaluate both proprietary models (e.g., Gemini 2.5 Pro) and open-source models (Qwen2.5-VL-7B-Instruct, Qwen2.5-VL-3B-Instruct), including BQwen2.5-VL-3B, a BARD fine-tuned variant of Qwen2.5-VL-3B-Instruct, across our defined benchmarks.
Project Git Folder
The official repository of the project is https://github.com/GabrieleGiudic/BARD
Setup
For the general setup look at https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct
Call the model according to our setup
import os
import json
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from tqdm import tqdm
import warnings
from qwen_vl_utils import process_vision_info
# Suppress a known transformers warning
#warnings.filterwarnings("ignore", message="The models vision encoder did not receive absolute position embeddings")
# --- 1. Configuration ---
MODEL_PATH = "GabrieleGiudici/BQwen2.5-VL-3B"
##get the file from BARD https://github.com/GabrieleGiudic/BARD/tree/master/validation/2025
DATA_FILE = "./data2025/model_input_dataset_color_number_2025.json"
OUTPUT_DIR = "./output"
PREDICTIONS_FILE = os.path.join(OUTPUT_DIR, "action_caption.json")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def load_model_and_processor():
"""Loads the Qwen model and processor."""
print(f"?? Loading fine-tuned model from: {MODEL_PATH}")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_PATH,
torch_dtype="auto",
device_map="auto",
trust_remote_code=True
).eval()
min_pixels = 256 * 28 * 28
max_pixels = 448 * 28 * 28#1280 * 28 * 28
processor = AutoProcessor.from_pretrained(MODEL_PATH, min_pixels=min_pixels, max_pixels=max_pixels
)
print("? Model and processor loaded successfully.")
return model, processor
def main():
"""Main function to run inference and save predictions."""
os.makedirs(OUTPUT_DIR, exist_ok=True)
model, processor = load_model_and_processor()
print(f"?? Loading data from {DATA_FILE}...")
with open(DATA_FILE, 'r') as f:
dataset = json.load(f)
results = []
print(f"\n?? Starting inference on {len(dataset)} videos...")
fps = 3.0
for item in tqdm(dataset, desc="Processing videos"):
video_path = item['video']
human_prompt = ""
for conv in item['conversations']:
if conv['from'] == 'human':
human_prompt = conv['value'].replace('<video>\n', '').strip()
break
if not os.path.exists(video_path):
print(f"?? Warning: video not found at {video_path}. Skipping.")
continue
if not human_prompt:
print(f"?? Warning: No human prompt found for {video_path}. Skipping.")
continue
# Prepare model inputs
messages = [{"role": "user", "content": [{"type": "video", "video": video_path,"resized_height": 420,
"resized_width": 784,"fps": fps,}, {"type": "text", "text": human_prompt}]}]
# Preparation for inference
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# Process the vision info
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
# Pass through processor
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
**video_kwargs,
)
# Move to CUDA
inputs = inputs.to("cuda")
# Generate response
generated_ids = model.generate(**inputs, max_new_tokens=400, do_sample=False)
generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
response = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
results.append({
"image": video_path,
"ground_truth": item['conversations'][1]['value'],
"prediction": response
})
print(results[-1])
with open(PREDICTIONS_FILE, 'w') as f:
json.dump(results, f, indent=4)
print(f"\n? Inference complete. Predictions saved to {PREDICTIONS_FILE}")
if __name__ == "__main__":
    main()
```
---
license: apache-2.0
language:
- en
base_model:
- Qwen/Qwen2.5-VL-3B-Instruct
pipeline_tag: video-text-to-text
---
Downloads last month: 1