morgendave commited on
Commit
080502f
·
verified ·
1 Parent(s): a00bee9

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Llama4ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_chunk_size": null,
7
+ "attention_dropout": 0.0,
8
+ "attn_scale": 0.1,
9
+ "attn_temperature_tuning": true,
10
+ "attn_temperature_tuning_floor": true,
11
+ "bos_token_id": 200000,
12
+ "cache_implementation": "static",
13
+ "draft_vocab_size": 202048,
14
+ "eos_token_id": 200001,
15
+ "floor_scale": 8192,
16
+ "for_llm_compressor": false,
17
+ "head_dim": 128,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 5120,
20
+ "initializer_range": 0.02,
21
+ "interleave_moe_layer_step": 0,
22
+ "intermediate_size": 8192,
23
+ "intermediate_size_mlp": 16384,
24
+ "max_position_embeddings": 262144,
25
+ "model_type": "llama4_text",
26
+ "moe_layers": [],
27
+ "no_rope_layers": [
28
+ 1,
29
+ 1,
30
+ 1
31
+ ],
32
+ "num_attention_heads": 40,
33
+ "num_experts_per_tok": 1,
34
+ "num_hidden_layers": 3,
35
+ "num_key_value_heads": 8,
36
+ "num_local_experts": 0,
37
+ "output_router_logits": false,
38
+ "pad_token_id": 200018,
39
+ "rms_norm_eps": 1e-05,
40
+ "rope_scaling": null,
41
+ "rope_theta": 500000.0,
42
+ "router_aux_loss_coef": 0.001,
43
+ "router_jitter_noise": 0.0,
44
+ "sliding_window": null,
45
+ "tie_word_embeddings": false,
46
+ "torch_dtype": "bfloat16",
47
+ "transformers_version": "4.52.0.dev0",
48
+ "use_cache": true,
49
+ "use_qk_norm": false,
50
+ "use_sliding_window": false,
51
+ "vocab_size": 202048,
52
+ "yoco_global_kv_layer": null,
53
+ "yoco_local_kv_layer": null
54
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 200000,
4
+ "cache_implementation": "static",
5
+ "eos_token_id": 200001,
6
+ "pad_token_id": 200018,
7
+ "transformers_version": "4.52.0.dev0"
8
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0b33783e0c60d299cbed7a2322617634599b2d5181e243d33904b667f2e2f07
3
+ size 4061341136
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d100638c48b8d94f925c323bc5c10d0e0635c6ea3d97758bea9a45cd8dacc1ad
3
+ size 2068971648
model.safetensors.index.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 6130309120
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00002-of-00002.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.fc.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.1.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.2.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.2.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.2.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.norm.weight": "model-00001-of-00002.safetensors"
37
+ }
38
+ }
params.json.raw ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aggregate_gradients_by_tokens": true,
3
+ "alternate_pp_config": true,
4
+ "async_batch_iterator": false,
5
+ "async_batch_iterator_timeout_s": 300,
6
+ "async_checkpointing": true,
7
+ "async_eval_ngpus": -1,
8
+ "attach_debugpy": false,
9
+ "background_nccl_init": false,
10
+ "batch_p2p_communication": false,
11
+ "batch_size": 2,
12
+ "cached_file_unique_prefix": "",
13
+ "checkpoint": {
14
+ "async_checkpointing_staging_method": "async_copy_async_serialize",
15
+ "barrier_timeout_secs": 120,
16
+ "checkpoint_barrier_type": "sc",
17
+ "checkpoint_gc_use_rmdir": true,
18
+ "checkpoint_groups": false,
19
+ "checkpoint_server_max_attempts": 10,
20
+ "checkpoint_server_num_chunks": 10,
21
+ "checkpoint_server_num_threads": 20,
22
+ "checkpoint_server_op_timeout_secs": 10.0,
23
+ "checkpoint_server_threads": 10,
24
+ "checkpoint_server_timeout_secs": 60.0,
25
+ "dump_freq_ephemeral": -1,
26
+ "eager_init_staging_buffer": true,
27
+ "live_checkpointing": false,
28
+ "on_demand_checkpointing": false,
29
+ "sleep_interval": 10,
30
+ "staging_block_every_n_tensors": -1,
31
+ "timeout_all_shard_exists": 300,
32
+ "timeout_barrier_init_secs": 300,
33
+ "timeout_execution": 1800,
34
+ "timeout_folder_exists": 300,
35
+ "timeout_process_init_secs": 60,
36
+ "use_checkpoint_barrier_tcpstore_libuv": true,
37
+ "use_checkpoint_barrier_wait_for_all_files": true,
38
+ "use_checkpoint_barrier_wait_for_dir": false,
39
+ "use_checkpointing_process": true,
40
+ "use_shm_manager_for_async_cp": false,
41
+ "wait_for_tensor_timeout_s": 120
42
+ },
43
+ "checkpoint_dump_dir": "/mnt/wsfuse/outputs/T-draft-17bx128MoE-N_3-b6pd09653zgfw",
44
+ "collect_et": false,
45
+ "context_parallel_size": 1,
46
+ "data": "",
47
+ "dataloader": {
48
+ "always_trim_text": true,
49
+ "concurrency_timeout_s": 300,
50
+ "concurrent": false,
51
+ "datamix": "/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/sql_sft:0.0001140800,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/scale_code_chunk:0.0000850100,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/sft_data_surge_tree_sitter_top_3:0.0933991500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/sft_data_scale_tree_sitter_top_3:0.0055914500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/turing_tree_sitter_top_3:0.0003080800,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/olivier_synthetic_code:0.0095646900,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/suchin_synthetic_code:0.0234710400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/olivier_synthetic_javascript:0.0020783500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/codeforces_few_shot_000:0.0006576000,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/synth_data_cleaning_0920_online_judge_sft:0.0003063400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/sten_surge_coding_with_exec_mulitpl_synth_240613_v2_format_clean:0.0006246500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/yundi_codeforces_rewrite_filter_solve_coding_wrapper_sft_format_clean:0.0004503500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/olivier_coding_synthetic_stackoverflow_inspired_samll_multipl_translation_v2_format_clean:0.0026521600,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/sten_coding_generated_problem_stack_overflow_L3_405B_self_healing_principled_tests_v8_good_v2_format_clean:0.0050457400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/iopairs_snippets_filtered_inductive_reasoning:0.0008189600,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/iopairs_snippets_275k_cruxeval_output:0.0022750500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/iopairs_snippets_275k_cruxeval_input:0.0015900500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/coding/debug_v5:0.0009672300,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/general_helpfulness_english/bio_sft_data2:0.0000024200,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/general_helpfulness_english/bio_sft_data_systemprompt2:0.0000033400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/general_helpfulness_english/hard_legal_mcq_w_reasoning_sft:0.0011769300,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/general_helpfulness_english/autoif_filtered_prompt_v1_v2_filtered_pass_75_format_filter_valid_link_fix_code:0.0011995400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/general_helpfulness_english/surge_precise_if_critic_rewrite_iter1_perfect_frr_tone_filter_format_clean_chunk:0.0201362100,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/general_helpfulness_english/surge_precise_if_critic_rlhf6pt5_rs_perfect_frr_tone_filter_format_clean_chunk:0.0330838000,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/general_helpfulness_english/scale_weak_areas:0.0004247400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/general_helpfulness_english/knn_mitigation_1shot_v1:0.0092553500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_original_perfect_response_format_clean_hard_chunked:0.0047733600,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_original_perfect_response_format_clean_medium_chunked:0.0471786600,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_sys_prompt_rewrite_iter1_format_clean_hard_chunked:0.0044165200,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_sys_prompt_rewrite_iter1_format_clean_medium_chunked:0.0264769000,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_general_steerability_2024_train_chunk:0.0018149600,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R3/general_helpfulness_english/table_yonder_oss_helpfulness_syngen_preachy_tone_sft_partition_is_partition_eq_true:0.0000572800,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/surge_reasoning:0.0026246000,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/round3_3p_decontaminated_rlhf6_mcq_rscot_50_cjka_fix_nomath_nobio:0.0203449500,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/up_synthetic_verbal_reasoning_405_highq185_few_shot_000:0.0011054200,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/legal_mbe_bar_few_shot_000_format_v3:0.0000727600,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/cpa_few_shot_000_cot_000_format_final_v3:0.0000278400,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/moral_decontaminate:0.0000427200,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/lovish_round6_v2_format_clean:0.0003930200,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/ctg_clean_downsampled0_5_mcq_no_geeks_v3:0.0002844500,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_aime_esbs_100_0_75_M30_SCORED_DECONTAM:0.0003522600,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_harp_esbs_100_0_75_M30_SCORED_DECONTAM:0.0017419500,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_math_train_esbs_100_0_75_M30_SCORED_DECONTAM:0.0045896500,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_omni-math_esbs_100_0_75_M30_SCORED_DECONTAM:0.0011989900,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v1_esbs_100_0_75_M30_SCORED_DECONTAM:0.0139459000,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v2_esbs_100_0_75_M30_SCORED_DECONTAM:0.0175181900,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v3_esbs_100_0_75_M30_SCORED_DECONTAM:0.0095064700,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v4_esbs_100_0_75_M30_SCORED_DECONTAM:0.0072784800,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_aime_N20_VERIFIED_SCORED_DECONTAM:0.0001480700,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_aops_N20_VERIFIED_SCORED_DECONTAM:0.0041684300,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_harp_N20_VERIFIED_SCORED_DECONTAM:0.0010190100,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_math_N20_VERIFIED_SCORED_DECONTAM:0.0030298100,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_omni_math_N20_VERIFIED_SCORED_DECONTAM:0.0005474500,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v1_N20_VERIFIED_SCORED_DECONTAM:0.0098020300,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v2_N20_VERIFIED_SCORED_DECONTAM:0.0136693500,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v3_N20_VERIFIED_SCORED_DECONTAM:0.0079475700,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v4_N20_VERIFIED_SCORED_DECONTAM:0.0047879200,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/250114_r1ab_data:0.0009782000,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/brainly_arpg_weak_area_mcq_mitigated_final:0.0006048700,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/reasoning_sft_mcq_final_sbs:0.0006742800,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/250106_r1_data_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0064538600,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/careers360_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0061167600,/mnt/wsfuse/finetune/shared/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/afanti_40k_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0090247200,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/multilingual/multilingual_r6_5_mix_rm_mathv1:0.0332281300,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/multilingual/surge_sft_hindi_romanized:0.0003671600,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/multilingual/scale_sft_final_format_clean_chunk:0.0081132900,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/multilingual/bio:0.0002783100,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT3R2/multilingual/new_rs_souped_multilinugal_critic_rewrite_data_format_clean:0.0648830400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/multilingual/rus_v7:0.0206462400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/multilingual/zho_v5:0.0224838300,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_mhx_online_positives_exp_v1_msgv2_pii_vr_frr_filtered_formatting_linted_mk_partition_1_eq_1:0.0999299400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/factuality/table_mh20_2_message_v2_mhx_sft_train_mh17_rlhf2_preference_factuality_perturb_v2_partition_1_eq_1:0.0317194500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/factuality/table_mh20_2_message_v2_mhx_sft_train_mh16_crs_hallucination_partition_1_eq_1:0.0000143900,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/factuality/table_mh20_2_message_v2_mhx_sft_train_mh15_synth_defamation_v0_diverse_partition_1_eq_1:0.0001121100,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/factuality/table_mh20_2_message_v2_synthetic_private_individuals_data_final_partitioned_partition_1_eq_1:0.0001191900,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_mh20_2_message_v2_meta_ai_mhx_capability_hallucination_for_option_2_sev_compliant_partitioned_partition_source_eq_capability_prompt_k_and_structured_v7_tone_transform_conserved:0.0000013000,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_mh20_2_message_v2_mhx_sft_train_mh16_adversarial_capability_hallucination_partition_1_eq_1:0.0000390300,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_mh20_2_message_v2_mhx_sft_train_imagine_synthetic_v0_partition_1_eq_1:0.0021983400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/reasoning/table_mh20_2_message_v2_mhx_sft_train_aqua_synthetic_cot_sft_data_v1_selfchecked_partition_1_eq_1:0.0069630200,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_syngen_en_mh24_sft_fRM_by_jtbd_filtered_message_fixed_v2_partition_ds_eq_2024-11-18:0.0104362700,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_mh22_formatting_nested_list_sft_message_v2_partition_ds_eq_2024-09-30:0.0000701200,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_mh22_formatting_nested_list_steerability_sft_message_v2_partition_ds_eq_2024-09-30:0.0000880400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_flyrs_en_essay_email_formatting_6_0_mh21_linted_message_v2_partitioned_partition_1_eq_1:0.0000523500,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/safety/table_mh20_2_message_v2_mhx_sft_train_scale_precise_instruct_off_the_shelf_expanded_partition_ds_eq_2024-08-05:0.0007582100,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_prod_if_rjs_sft_v1_partition_source_eq_prod_if_tool:0.0040049900,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_prod_if_rjs_sft_v1_partition_source_eq_prod_if_non_tool:0.0053242800,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_prod_if_rjs_sft_v1_partition_source_eq_prod_if_tool_hard_train:0.0000091400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/multilingual/table_mh20_2_message_v2_mh6_exp_datasets_unified_multilingual_for_option_2_sev_compliant_partitioned_partition_source_eq_mh18_multilingual_redaction_fix:0.0000002000,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/multilingual/table_mh20_2_message_v2_mh6_exp_datasets_unified_multilingual_for_option_2_sev_compliant_partitioned_partition_source_eq_english_lmr_fix_mh18:0.0000005700,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/tooling/table_mh20_2_message_v2_fbsearch_followup_refusal_0703_v1_partition_source_eq_1p_search_summary_history_injection:0.0000243400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/tooling/table_mh20_2_message_v2_mhx_sft_train_reels_summarization_mh7_partition_1_eq_1:0.0044526900,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_p13n_mh_sft_11_11_mrspdp_msg_v2_partition_ds_eq_2024-11-11:0.0012578800,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_p13n_mh_sft_11_11_memory_msg_v2_partition_ds_eq_2024-11-11:0.0005490800,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_p13n_mh_sft_11_11_canonical_msg_v2_partition_ds_eq_2024-11-11:0.0005627400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_ai_memory_explic_intent_response_train_500_partition_1_eq_1:0.0005429000,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/multilingual/table_research_sft_data_multilingual_oct_langs_only_with_sys_prompt_randomized_date_msg_v2_partition_ds_eq_2024-09-28:0.0088870400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/multilingual/table_multilingual_prod_traffic_10_11_vn_th_id_ar_lang_v3_rjs_lmr_out_msg_v2_cs_fix_for_option_2_compliant_lid13_lmr_clean_partition_ds_eq_2024-10-14:0.0012500600,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/multilingual/table_multilingual_oct_lang_flywheel_refresh_rjs_ranked_lid15_cs_pii_filter_final_message_v2_partition_ds_eq_2024-11-15:0.0020233600,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_syngen_yonder0_data_replenish_sft_partition_1_eq_1:0.0158649800,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/multilingual/table_mh20_2_message_v2_mh6_exp_datasets_unified_multilingual_for_option_2_sev_compliant_partitioned_redaction_fix_partition_source_eq_mh18_multilingual_redaction_fix:0.0001173400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/multilingual/table_mh20_2_message_v2_mh6_exp_datasets_unified_multilingual_for_option_2_sev_compliant_partitioned_english_lmr_fix_partition_source_eq_english_lmr_fix_mh18:0.0000001400,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_yonder_flywheel_syngen_preachy_tone_sft_partition_is_partition_eq_true:0.0017173800,/mnt/wsfuse/finetune/shared/YonderTrains/12M/LT2R3/metaai/general_helpfulness_english/table_yonder_flywheel_syngen_preachy_tone_sft_privacy_safe_partition_is_partition_eq_true:0.0009091400,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/flyrs_en_essay_email_formatting_msgv2_v0:0.0000433000,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/flyrs_mlt_4_0_dpo_mh23_1_lmr_msgv2_v0:0.0001682800,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/mh20_2_message_v2_mhx_sft_train_sg_capability_canned_responses_v1_msgv2_v0:0.0000356300,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/mh20_2_message_v2_mhx_sft_train_sg_non_plugins_mh15_msgv2_v0:0.0001764900,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/mh24_voice_data_flywheel_tts_filtering_top_candidates_msgv2_v0:0.0017267900,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/mhx_non_user_sft_data_rlhf6_msgv2_v0:0.0546009800,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v2/mhx_non_user_sft_data_rlhf6_msgv2_v0_batch1:0.1165484600,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/multilingual_oct_lang_flywheel_refresh_rjs_msgv2_v0:0.0004449600,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/p13n_mh_sft_11_11_memory_msg_msgv2_v0:0.0002283200,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/factual_dpo_11_18_factuality_ace_preference_claim_support_v0:0.0029266800,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/factuality_ace_edit_preference_train_v1_join_enrich_1119_v0:0.0014881800,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/mh20_2_message_v2_mhx_sft_train_aqua_synthetic_cot_sft_data_v1_selfchecked_v0:0.0012164800,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/mh_safety_sft_data_safety_mlg_oct_lang_frr_0916_lid13_lmr_clean_v0:0.0005607500,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/mhx_online_positives_exp_v1_msgv2_pii_vr_frr_filtered_v0:0.0147447700,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/research_sft_data_multilingual_oct_langs_only_with_sys_prompt_randomized_date_msg_v2_v0:0.0015526100,/mnt/wsfuse/samuelcai/pd/training_data/mh24_eagle_v0/raw_data/v1/syngen_en_mh24_dpo_fRM_by_jtbd_filtered_final_message_v2_v0:0.0035373300",
52
+ "enable_packing": true,
53
+ "image": null,
54
+ "load_only_tp_zero": false,
55
+ "logging_config": {
56
+ "log_buffer_size": false,
57
+ "log_every_n_steps": 10,
58
+ "log_first_batch": false,
59
+ "log_full_dataloader_state": false,
60
+ "log_metadata": false
61
+ },
62
+ "max_world_size": null,
63
+ "mix_mode": {},
64
+ "modality_datamix": null,
65
+ "pad_mode": {
66
+ "pad_value": 200018,
67
+ "seq_len": 4096
68
+ },
69
+ "pin_memory": true,
70
+ "prefetch_factor": null,
71
+ "progress_reporter_log_interval": 0,
72
+ "rng_mode": {},
73
+ "shuffle_seed": 1337,
74
+ "simulate_training_budget": null,
75
+ "speech": null,
76
+ "split_mode": {
77
+ "discard_text_only": false,
78
+ "keep_interval": 2,
79
+ "keep_strategy": "put_back"
80
+ },
81
+ "tail_token_mode": {},
82
+ "video": null,
83
+ "weights_update_config": null,
84
+ "workers_per_gpu": 1
85
+ },
86
+ "dataset_iteration_limits": null,
87
+ "deallocate_pipeline_outputs": true,
88
+ "disable_logging": false,
89
+ "disable_workers_print": false,
90
+ "dtype": "bf16",
91
+ "dummy_nccl_init": true,
92
+ "dump_dir": "/mnt/wsfuse/outputs/T-draft-17bx128MoE-N_3-b6pd09653zgfw",
93
+ "dump_dir_tree_type": "sharded",
94
+ "dump_freq": 50,
95
+ "dump_profile_traces": true,
96
+ "eager_init": true,
97
+ "enable_anomaly_detection": false,
98
+ "enable_deterministic_training": false,
99
+ "enable_loss_tracker": true,
100
+ "enable_ods": true,
101
+ "enable_pynvml": false,
102
+ "et_end_itr": 15,
103
+ "et_start_itr": 12,
104
+ "eval_freq": -1,
105
+ "exp_id": "",
106
+ "exp_name": "",
107
+ "expert_parallel_size": 32,
108
+ "finetuning_dir": "",
109
+ "fp32_reduce_scatter": "all",
110
+ "gc_collect_freq": 1000,
111
+ "gpu_check_level": -1,
112
+ "increase_seq": null,
113
+ "instruct": {
114
+ "no_loss_prompt": false,
115
+ "no_loss_truncated": false
116
+ },
117
+ "instruct_data": "",
118
+ "iter_jsonl": {
119
+ "buffer_size": 64,
120
+ "same_data": false
121
+ },
122
+ "iter_multi": {
123
+ "buffer_size": 64,
124
+ "ignore_extra_chunks": true,
125
+ "iterate_chunk_by_chunk": false,
126
+ "max_precompute": 20,
127
+ "multiprocess": true
128
+ },
129
+ "iter_text_airstore": {
130
+ "airstore_max_holding_bundles_limit": null,
131
+ "airstore_max_resharding_factor": null,
132
+ "airstore_sample_prefetch_limit": null,
133
+ "airstore_seed": 727,
134
+ "dataloader_workers_per_gpu": 0,
135
+ "load_only_pp_zero": false,
136
+ "load_only_tp_zero": false,
137
+ "max_world_size": null,
138
+ "pin_memory": true,
139
+ "prefetch_factor": null,
140
+ "simulate_training_budget": null,
141
+ "unique_token_fraction": null
142
+ },
143
+ "iter_type": "multi",
144
+ "keep_eval_checkpoints": false,
145
+ "keep_n_last_checkpoints": -1,
146
+ "load_optimizer_on_finetuning": false,
147
+ "log": {
148
+ "disable_scalars_tb_write": false,
149
+ "log_loss_tracker_to_scuba": false,
150
+ "log_scalar_default_log_level": "INFO",
151
+ "log_scalar_freq": 100,
152
+ "log_scalar_freq_overrides": "fp8:1000, router/modality:1000, router/dataset:1000, params:0,grads_fsdpv2:0,debug:0,verbose_debug:0",
153
+ "log_scalar_log_level_overrides": "",
154
+ "log_scalar_version": 2.0,
155
+ "log_scalars": false,
156
+ "log_scalars_to_ods": false,
157
+ "log_scalars_to_scuba": false,
158
+ "log_tb": true,
159
+ "log_tensors": false,
160
+ "log_tensors_to_scuba": false,
161
+ "online_wandb": false,
162
+ "online_wandb_project": null,
163
+ "online_wandb_team": null,
164
+ "reduce_scalars": false
165
+ },
166
+ "log_all_steps": true,
167
+ "log_batch_checksum": true,
168
+ "log_dataloader_state": false,
169
+ "log_freq": 1,
170
+ "log_position_in_data_queue": true,
171
+ "log_updates": true,
172
+ "logitwriter": {
173
+ "compression_algo": "zstd",
174
+ "enable": false,
175
+ "index_dtype": "int32",
176
+ "logit_dtype": "float32",
177
+ "same_day_logits_backup": false,
178
+ "speech_topk": 100,
179
+ "topk": 100,
180
+ "write_lse": true
181
+ },
182
+ "loss_logging_freq": 10,
183
+ "loss_rescaling": false,
184
+ "max_image_tiles_per_gpu": 2000,
185
+ "mb_recompute_attn": false,
186
+ "mb_recompute_fc1_fc3": false,
187
+ "mem_snapshot_max_entries": 100000,
188
+ "mem_snapshot_profiling_duration": 3,
189
+ "mem_snapshot_start_step": -1,
190
+ "mem_snapshot_stop_step": -1,
191
+ "memory_efficient_pipeline": false,
192
+ "model": {
193
+ "alpha_depth": "disabled",
194
+ "alpha_lrm": 1.0,
195
+ "alpha_on_resid": false,
196
+ "alpha_separate": false,
197
+ "alpha_wdm": 1.0,
198
+ "attn_bias_type": "block_causal",
199
+ "attn_dropout": 0,
200
+ "attn_out_dropout": 0,
201
+ "attn_temperature_tuning_floor_scale": null,
202
+ "attn_temperature_tuning_layers": null,
203
+ "attn_temperature_tuning_q_scale_constant": null,
204
+ "attn_to_keep": "all",
205
+ "batchify_local_attention_len": null,
206
+ "cp_attn_perdoc": false,
207
+ "cp_attn_save_global_kv": true,
208
+ "custom_bwd": false,
209
+ "custom_bwd_sum_first_then_comms": true,
210
+ "dialog_len": null,
211
+ "dim": 5120,
212
+ "efficient_attn": "auto",
213
+ "efficient_output": false,
214
+ "enable_fsdpv2": true,
215
+ "enable_tp_overlapping": false,
216
+ "enable_weight_sharding_in_pp": false,
217
+ "enable_wgrad_sharding_in_pp": false,
218
+ "eos_id": 200001,
219
+ "every_n_layers_nope": null,
220
+ "experts_choice_moe": {
221
+ "auto_scale_F": true,
222
+ "capacity_factor": 1.5,
223
+ "clamp_above_std": false,
224
+ "compute_moe_in_fp64": false,
225
+ "drop_and_pad": false,
226
+ "enable_lb_free": false,
227
+ "enable_lb_loss": false,
228
+ "enable_router_zloss": false,
229
+ "eval_threshold_std_mult": 0.0,
230
+ "eval_with_expert_activation_model": false,
231
+ "eval_with_saved_stats": true,
232
+ "eval_with_top_k": false,
233
+ "expert_act_grad_prop_coeff": 0,
234
+ "expert_act_init_std": 0.5,
235
+ "expert_act_loss_coeff": 0.0001,
236
+ "expert_act_silu": false,
237
+ "expert_act_threshold": 0,
238
+ "expert_activation_model": false,
239
+ "fc1_clamp": null,
240
+ "fc2_clamp": null,
241
+ "fc3_clamp": null,
242
+ "fix_datasource_router_score": "",
243
+ "fix_image_router_score": null,
244
+ "fix_speech_router_score": null,
245
+ "force_looped_impl": false,
246
+ "fused_shuffle": true,
247
+ "input_scaling": false,
248
+ "input_scaling_max_clamp": 2.0,
249
+ "input_scaling_min_clamp": -2.0,
250
+ "interleave_moe_layer_step": 2,
251
+ "is_enabled": false,
252
+ "lb_free_coeff": 0.0,
253
+ "lb_loss_coeff": 0.0,
254
+ "max_experts_per_token": null,
255
+ "moe_init_scale": 1.0,
256
+ "mult_moe_weight_grads": null,
257
+ "norm_expert_output": null,
258
+ "num_experts": 128,
259
+ "overlap_token_comm": true,
260
+ "postgate_experts": false,
261
+ "recompute_capacity_factor": null,
262
+ "routed_dropout": 0.0,
263
+ "router_clamp": null,
264
+ "router_kld_reg": 0.0,
265
+ "router_padding_coeff": null,
266
+ "router_score_gating": "sigmoid",
267
+ "router_zloss_coeff": 1.3143357982572078e-18,
268
+ "running_stats_ema": 0.99,
269
+ "running_stats_sync_freq": 100,
270
+ "saved_thresholds_are_post_sigmoid": false,
271
+ "sharding_strategy": "dp2ep-v1",
272
+ "shuffle_before_assign": false,
273
+ "shuffle_freq": 1,
274
+ "shuffle_group_size": null,
275
+ "shuffle_level": 3,
276
+ "shuffle_with_random_order": true,
277
+ "shuffle_within_dp": false,
278
+ "sigmoid_in_fp32": true,
279
+ "skip_local_shuffle": false,
280
+ "std_margin": 15,
281
+ "std_margin_skip_last": true,
282
+ "std_penalty_coeff": 0.0,
283
+ "top_k": 1,
284
+ "use_fixed_topk": false,
285
+ "use_fixed_topk_bsz": 1,
286
+ "use_fsdp": true,
287
+ "use_shared_expert": true,
288
+ "use_te_in_moe": false,
289
+ "use_token_choice": true,
290
+ "zero_clamp_grads": true,
291
+ "zero_router_grads": false
292
+ },
293
+ "ffn_dim_multiplier": 1.2,
294
+ "ffn_exp": 4.0,
295
+ "ffn_in_dropout": 0,
296
+ "ffn_out_dropout": 0,
297
+ "flex_score_mod": "",
298
+ "fp8_amax_compute_algo": "max",
299
+ "fp8_amax_history_len": 1024,
300
+ "fp8_early_bf16_weight_release": false,
301
+ "fp8_fuse_wgrad_accumulation": false,
302
+ "fp8_grad_output_dynamic_scale": false,
303
+ "fp8_input_dynamic_scale": false,
304
+ "fp8_interval": 1,
305
+ "fp8_margin": 0,
306
+ "fp8_rowwise": false,
307
+ "fp8_wgrad": false,
308
+ "freeze_decoder": false,
309
+ "freeze_patterns": null,
310
+ "freeze_vision_encoder": false,
311
+ "fsdp_checkpoint_wrap_layer_frequency": 1,
312
+ "fsdpv1_flatten_params": true,
313
+ "fsdpv2_cast_root_forward_inputs": false,
314
+ "fsdpv2_cpu_offload_percentage": null,
315
+ "fsdpv2_enable_cpu_offload": false,
316
+ "fsdpv2_use_per_pg_streams": true,
317
+ "fsdpv2_wrap_pp_model_chunk_only": false,
318
+ "fuse_sequence_parallel": true,
319
+ "global_attn_cfg": "all",
320
+ "head_dim": 128,
321
+ "high_freq_factor": 32,
322
+ "hsdp_replicate_num": 1,
323
+ "init": {
324
+ "coeff_std": null,
325
+ "depth_last": false,
326
+ "fixed_std": null,
327
+ "no_init": false,
328
+ "router_coeff_std": 0.1,
329
+ "truncate_std_mult": 2.0,
330
+ "use_depth": "current",
331
+ "use_gaussian": true
332
+ },
333
+ "layer_ckpt": "none",
334
+ "lc_rope_len": 0,
335
+ "lc_rope_prob": 0.0,
336
+ "less_layer_first_pp_stage": 0,
337
+ "less_layer_last_pp_stage": 0,
338
+ "local_attention_window_len": null,
339
+ "loss_parallel": false,
340
+ "max_length": 2048,
341
+ "metap": {
342
+ "base_width": 1024.0,
343
+ "coeff_std": 1.0,
344
+ "m_emb": 1.0,
345
+ "metap_mode": "ntp",
346
+ "tie_router_bulk_coeff_std": false,
347
+ "use_metap": false
348
+ },
349
+ "modalities": {
350
+ "freeze_llm": false,
351
+ "image": {
352
+ "enable_projection": true,
353
+ "encoder_name": "llama4_flash_encoder",
354
+ "encoder_params": null,
355
+ "freeze_vision_encoder": true,
356
+ "image_height": 336,
357
+ "image_width": 336,
358
+ "patch_height": 14,
359
+ "patch_width": 14,
360
+ "ps_ratio": 0.5,
361
+ "recompute_transformer": true,
362
+ "return_intermediate": null,
363
+ "use_cached_embeddings": false,
364
+ "use_dynamic_transform": true,
365
+ "vision_adapter_type": "pixel_shuffle_mlp",
366
+ "vision_encoder_ckpt_path": "/mnt/wsfuse/nextgen_mm/vision_encoders/llama4_flash_encoder_1016_338k",
367
+ "vision_encoding_batch_size": null
368
+ },
369
+ "speech": {
370
+ "append_quantization_output": false,
371
+ "data_format_args": {
372
+ "disallow_text_free_seg": true,
373
+ "emit_text_right_after_sys_start": true,
374
+ "enable_speech_text_hybrid": false,
375
+ "hybrid_generation_mode": "single_token_emit",
376
+ "hybrid_understanding_mode": "streaming",
377
+ "jitter_system_prompt": false,
378
+ "jitter_system_prompt_today_date": false,
379
+ "num_words_in_unit": 1,
380
+ "speech_delay": 0,
381
+ "system_text_lookahead": 0,
382
+ "tool_token_delay_ms_max": 100,
383
+ "tool_token_delay_ms_min": 0,
384
+ "transfer_dates_to_template": false,
385
+ "turn_start_with_white_space": false,
386
+ "user_text_delay": 6
387
+ },
388
+ "discrete_codebooks_size": 8192,
389
+ "enable_aux_user_output": false,
390
+ "enable_full_duplex": false,
391
+ "enable_output": false,
392
+ "encoder_device": "cuda",
393
+ "freeze_speech_encoder": true,
394
+ "is_tokenizer": true,
395
+ "load_tokenizer": false,
396
+ "share_speech_emb": false,
397
+ "speech_encoder": null,
398
+ "speech_encoder_ckpt_dir": null,
399
+ "speech_extend_vocab_size": 0,
400
+ "speech_feature_dim": 320,
401
+ "speech_output_control_format": "",
402
+ "speech_projection_dim": 1536,
403
+ "speech_separate_softmax": false,
404
+ "speech_train_audio_end": false,
405
+ "speech_train_audio_start": false,
406
+ "target_speaker_table_size": 0,
407
+ "use_discrete_codes": false,
408
+ "use_embedding": false,
409
+ "use_fp32_for_speech_output": true,
410
+ "use_fp64": true,
411
+ "use_projection": true,
412
+ "user_embedding_by_concat": false,
413
+ "user_embedding_by_permutation": true,
414
+ "user_projection_use_mlp": false
415
+ },
416
+ "use_image": false,
417
+ "use_speech": false,
418
+ "use_video": false
419
+ },
420
+ "multiple_of": 2048,
421
+ "n_heads": 40,
422
+ "n_kv_heads": 8,
423
+ "n_layers": 3,
424
+ "non_linearity": "swiglu",
425
+ "nope_no_qk_norm": true,
426
+ "norm_affine": true,
427
+ "norm_eps": 1e-05,
428
+ "norm_type": "rmsnorm",
429
+ "num_unfrozen_layers": 0,
430
+ "output_size": 202048,
431
+ "parallel_decoding": {
432
+ "enable_fc_parallelism": true,
433
+ "fc_with_bias": false,
434
+ "first_and_last_norm_required": true,
435
+ "has_parallel_decoding": true,
436
+ "parallel_decoding_type": "EAGLE",
437
+ "share_input_output_embed_with_target": true
438
+ },
439
+ "parallel_output_norm": true,
440
+ "peft_args": null,
441
+ "pp_use_tensor_pool": false,
442
+ "pre_norm": true,
443
+ "prefetch_weight_latency": 1.0,
444
+ "qat_args": null,
445
+ "qk_norm_across_heads": false,
446
+ "qk_norm_affine": false,
447
+ "recompute_attn": true,
448
+ "recompute_fc1_out": true,
449
+ "recompute_fc3_out": true,
450
+ "recompute_q_norm": false,
451
+ "rope_attn_scale": false,
452
+ "rope_scale_factor": 1,
453
+ "rope_theta": 500000.0,
454
+ "rope_use_fp32_in_outer_product": true,
455
+ "sequence_parallel": false,
456
+ "share_emb": false,
457
+ "stochastic_depth_p_attn": 0,
458
+ "stochastic_depth_p_ffn": 0,
459
+ "te_use_fsdp_mixed_precision": true,
460
+ "use_flex_attn": false,
461
+ "use_fp8": false,
462
+ "use_qk_norm": false,
463
+ "use_rope": true,
464
+ "use_scaled_rope": false,
465
+ "use_te_layers": true,
466
+ "vocab_parallel": true,
467
+ "vocab_size": 202048
468
+ },
469
+ "model_parallel_size": 8,
470
+ "model_precheck": false,
471
+ "nan_detector_steps": 0,
472
+ "no_final_ckpt": false,
473
+ "num_layers_per_virtual_pipeline_stage": null,
474
+ "num_microbatches_with_partial_activation_checkpoints": 1,
475
+ "number_of_manifold_servers_per_host": 8,
476
+ "old_mp": -1,
477
+ "old_world_size": -1,
478
+ "optim": {
479
+ "annealing_step": 10000,
480
+ "beta1": 0.9,
481
+ "beta2": 0.95,
482
+ "clip": 1.0,
483
+ "cosine_theta": 1.0,
484
+ "cycle_length": 1.0,
485
+ "decay_length_fraction": 0.1,
486
+ "epsilon": 1e-08,
487
+ "exp_factor": 0.5,
488
+ "fused": null,
489
+ "grad_accumulate_steps": 1,
490
+ "independent_weight_decay": false,
491
+ "lr": 0.0002,
492
+ "lr_min_ratio": 0.1,
493
+ "modality_order": "text,vision,speech,speech_full_duplex",
494
+ "non_nope_lr_mult": null,
495
+ "nope_lr_mult": null,
496
+ "scheduler": "constant",
497
+ "start_annealing_step": -1,
498
+ "use_fp32_copy_optim": true,
499
+ "vision_encoder_lr": null,
500
+ "vision_projection_lr": null,
501
+ "warmup": 0,
502
+ "weight_decay": 0.1
503
+ },
504
+ "optimize_backward_concat": false,
505
+ "overlap_p2p_communication": true,
506
+ "paft": {
507
+ "all_reduce_timeout_grow_ms": 300000,
508
+ "all_reduce_timeout_ms": 60000,
509
+ "ctran_port_base": 18700,
510
+ "enable": false,
511
+ "ib_exchange_port_base": 18600,
512
+ "max_quorum_num_retries": 5,
513
+ "max_step_retries": 5,
514
+ "min_replicas_to_run": null,
515
+ "qp_connect_timeout_ms": null,
516
+ "replica_collective_timeout_s": 600,
517
+ "send_recv_timeout_ms": 5000,
518
+ "startup_sleep_ms": 10000,
519
+ "test_only_barrier_timeout_s": 180,
520
+ "test_only_skip_ftar": false
521
+ },
522
+ "periodic_gpu_check": false,
523
+ "pg_tuning_options_from_yaml": "",
524
+ "pipeline_parallel_microbatch_size": 1,
525
+ "pipeline_parallel_size": 1,
526
+ "pipeline_strategy": "dora-dfs",
527
+ "power_consumer": {
528
+ "enable": false,
529
+ "run_delay_steps": 0,
530
+ "run_duration_steps": 100,
531
+ "run_freq": 1000,
532
+ "run_mode": "periodic"
533
+ },
534
+ "pp_num_warm_up_microbatch_ratio": null,
535
+ "profile_acc_events": false,
536
+ "profile_barrier_timeout_s": 0,
537
+ "profile_freq": -1,
538
+ "profile_num_steps_active": 1,
539
+ "profile_record_shapes": true,
540
+ "profile_with_stack": false,
541
+ "py_spy_args": {
542
+ "active_seconds": 600,
543
+ "format": "flamegraph",
544
+ "freq": -1,
545
+ "rank0_only": true,
546
+ "rate": 50,
547
+ "start_offset": 10
548
+ },
549
+ "recompute_all_mb": false,
550
+ "reshard_after_forward": false,
551
+ "restore_dataloader_position": false,
552
+ "root_dump_dir": "/mnt/wsfuse/outputs/xldumps",
553
+ "runtime_nccl_timeout_s": 600,
554
+ "sample_across_datasets": true,
555
+ "seq_len": 4096,
556
+ "skip_evals_during_training": true,
557
+ "slurm": {
558
+ "global_rank": 0,
559
+ "is_slurm_job": false,
560
+ "role_index": 0,
561
+ "role_rank": 0,
562
+ "role_replica_count": 1,
563
+ "role_world_size": 256,
564
+ "world_size": 256
565
+ },
566
+ "speech_loss": {
567
+ "aux_aligned_text_loss": false,
568
+ "aux_user_loss_weight": 0.9,
569
+ "dual_channel_aux_user_loss_weight": 0,
570
+ "enable": false,
571
+ "force_simulated_sys_loss": true,
572
+ "full_duplex_dual_loss_mode": "sample",
573
+ "kind": "single_softmax",
574
+ "log_logits": false,
575
+ "maybe_tool_token_loss_weight": null,
576
+ "perfect_silence_id": null,
577
+ "speech_loss_weight": null,
578
+ "system_floors_weight": null,
579
+ "system_perfect_silence_weight": 0.0,
580
+ "system_text_escape_audio_weight": null,
581
+ "user_text_escape_audio_weight": null
582
+ },
583
+ "steps": 1000000,
584
+ "text_only_steps": null,
585
+ "tokenizer": {
586
+ "path": "/mnt/wsfuse/tokenizers/tiktoken/l4_200k_base",
587
+ "version": "llama4_tiktoken_v6"
588
+ },
589
+ "tokenizer_dir": "/mnt/wsfuse/tokenizers/tiktoken",
590
+ "torch_seed": 0,
591
+ "unlimited_steps": false,
592
+ "use_sum_loss": false,
593
+ "valid": {
594
+ "batch_size": 32,
595
+ "debug": false,
596
+ "majority_voting": 0,
597
+ "n_batches": 100,
598
+ "ppl_files_str": "",
599
+ "prompt_path": "",
600
+ "random_fewshots": false,
601
+ "seed": 42,
602
+ "seq_len": 2048,
603
+ "skip_sanity_check": false,
604
+ "tasks_root_dir": "",
605
+ "tasks_str": "",
606
+ "temperature": 0.0,
607
+ "top_k": 0,
608
+ "top_p": 0.0,
609
+ "use_sampling": false,
610
+ "write_eval": false
611
+ },
612
+ "z_loss_multiplier": 0.0
613
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "image_processor_type": "Llama4ImageProcessorFast",
17
+ "image_std": [
18
+ 0.5,
19
+ 0.5,
20
+ 0.5
21
+ ],
22
+ "input_data_format": null,
23
+ "max_patches": 16,
24
+ "processor_class": "Llama4Processor",
25
+ "resample": 2,
26
+ "rescale_factor": 0.00392156862745098,
27
+ "resize_to_max_canvas": false,
28
+ "return_tensors": null,
29
+ "size": {
30
+ "height": 336,
31
+ "width": 336
32
+ }
33
+ }
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "fake_image_token": "<|image|>",
3
+ "image_token": "<|image|>",
4
+ "patch_size": 14,
5
+ "processor_class": "Llama4Processor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|begin_of_text|>",
3
+ "eos_token": "<|end_of_text|>",
4
+ "pad_token": "<|finetune_right_pad_id|>"
5
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22e009b4fcb58eddbabf347e71b9881ea1e6eb72d44e5ea9477c7587df68fd8d
3
+ size 27948580
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff