| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.136, | |
| "eval_steps": 500, | |
| "global_step": 170, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2975.1640625, | |
| "epoch": 0.0008, | |
| "grad_norm": 0.4355019563597344, | |
| "kl": 0.0, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0713, | |
| "reward": 0.20156250591389835, | |
| "reward_std": 0.3847714839503169, | |
| "rewards/end_of_conversation_reward_func": 0.04531250044237822, | |
| "rewards/end_rm_reward_func": 0.234375, | |
| "rewards/length_reward_func": -0.078125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.004448831474292092, | |
| "epoch": 0.0016, | |
| "grad_norm": 0.43263977335652204, | |
| "kl": 0.0011990070343017578, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0713, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3296.3828125, | |
| "epoch": 0.0024, | |
| "grad_norm": 0.4062162240261375, | |
| "kl": 0.0015230178833007812, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0715, | |
| "reward": 0.13886718894355, | |
| "reward_std": 0.47177961003035307, | |
| "rewards/end_of_conversation_reward_func": 0.04218750080326572, | |
| "rewards/end_rm_reward_func": 0.1669921875, | |
| "rewards/length_reward_func": -0.0703125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.004145426195464097, | |
| "epoch": 0.0032, | |
| "grad_norm": 0.40276269917704577, | |
| "kl": 0.0014390945434570312, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0715, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3100.2265625, | |
| "epoch": 0.004, | |
| "grad_norm": 0.4627427111427237, | |
| "kl": 0.0020265579223632812, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0995, | |
| "reward": 0.08027344156289473, | |
| "reward_std": 0.39663981925696135, | |
| "rewards/end_of_conversation_reward_func": 0.042187501152511686, | |
| "rewards/end_rm_reward_func": 0.1279296875, | |
| "rewards/length_reward_func": -0.08984375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.004560721252346411, | |
| "epoch": 0.0048, | |
| "grad_norm": 0.46529399802489874, | |
| "kl": 0.0024538040161132812, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0994, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3276.453125, | |
| "epoch": 0.0056, | |
| "grad_norm": 0.3877916318690821, | |
| "kl": 0.0020666122436523438, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0582, | |
| "reward": 0.21875000558793545, | |
| "reward_std": 0.4846916552633047, | |
| "rewards/end_of_conversation_reward_func": 0.04296875110594556, | |
| "rewards/end_rm_reward_func": 0.23828125, | |
| "rewards/length_reward_func": -0.0625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.003973798331571743, | |
| "epoch": 0.0064, | |
| "grad_norm": 0.38615742327777886, | |
| "kl": 0.0033469200134277344, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0582, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3526.203125, | |
| "epoch": 0.0072, | |
| "grad_norm": 0.4253042896775168, | |
| "kl": 0.0022199153900146484, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0966, | |
| "reward": 0.1652343834284693, | |
| "reward_std": 0.44294953159987926, | |
| "rewards/end_of_conversation_reward_func": 0.04218750086147338, | |
| "rewards/end_rm_reward_func": 0.208984375, | |
| "rewards/length_reward_func": -0.0859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0041108929144684225, | |
| "epoch": 0.008, | |
| "grad_norm": 0.4043558511957573, | |
| "kl": 0.003947257995605469, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0965, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3034.078125, | |
| "epoch": 0.0088, | |
| "grad_norm": 0.49241796839949487, | |
| "kl": 0.002953052520751953, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1329, | |
| "reward": 0.17500000772997737, | |
| "reward_std": 0.4498265013098717, | |
| "rewards/end_of_conversation_reward_func": 0.0421875009778887, | |
| "rewards/end_rm_reward_func": 0.18359375, | |
| "rewards/length_reward_func": -0.05078125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0041437734180362895, | |
| "epoch": 0.0096, | |
| "grad_norm": 0.4951138738584942, | |
| "kl": 0.0035958290100097656, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1329, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3284.0234375, | |
| "epoch": 0.0104, | |
| "grad_norm": 0.41732249846603825, | |
| "kl": 0.020305156707763672, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0294, | |
| "reward": 0.20390625670552254, | |
| "reward_std": 0.4014717759564519, | |
| "rewards/end_of_conversation_reward_func": 0.047656250884756446, | |
| "rewards/end_rm_reward_func": 0.22265625, | |
| "rewards/length_reward_func": -0.06640625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.00431825453415513, | |
| "epoch": 0.0112, | |
| "grad_norm": 0.4109822678418246, | |
| "kl": 0.030427932739257812, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0295, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2969.1953125, | |
| "epoch": 0.012, | |
| "grad_norm": 0.48270296374119925, | |
| "kl": 0.010631561279296875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0477, | |
| "reward": 0.2396484500495717, | |
| "reward_std": 0.3797372495755553, | |
| "rewards/end_of_conversation_reward_func": 0.04921875154832378, | |
| "rewards/end_rm_reward_func": 0.2255859375, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.004689372595748864, | |
| "epoch": 0.0128, | |
| "grad_norm": 0.4751005673450339, | |
| "kl": 0.011091232299804688, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0476, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2861.1953125, | |
| "epoch": 0.0136, | |
| "grad_norm": 0.5567124142202874, | |
| "kl": 0.014386177062988281, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0303, | |
| "reward": 0.21015625749714673, | |
| "reward_std": 0.4733537929132581, | |
| "rewards/end_of_conversation_reward_func": 0.05781250004656613, | |
| "rewards/end_rm_reward_func": 0.2265625, | |
| "rewards/length_reward_func": -0.07421875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.004998111340682954, | |
| "epoch": 0.0144, | |
| "grad_norm": 0.5465399440176335, | |
| "kl": 0.013627052307128906, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0301, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2829.671875, | |
| "epoch": 0.0152, | |
| "grad_norm": 0.5293451315346432, | |
| "kl": 0.01097869873046875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0218, | |
| "reward": 0.28183594974689186, | |
| "reward_std": 0.49502080073580146, | |
| "rewards/end_of_conversation_reward_func": 0.0523437510128133, | |
| "rewards/end_rm_reward_func": 0.2646484375, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.004524174742982723, | |
| "epoch": 0.016, | |
| "grad_norm": 0.5117856717280709, | |
| "kl": 0.010786056518554688, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0217, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3161.296875, | |
| "epoch": 0.0168, | |
| "grad_norm": 0.41948668078238804, | |
| "kl": 0.008313179016113281, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0421, | |
| "reward": 0.3046874995343387, | |
| "reward_std": 0.40682537760585546, | |
| "rewards/end_of_conversation_reward_func": 0.04296875046566129, | |
| "rewards/end_rm_reward_func": 0.33203125, | |
| "rewards/length_reward_func": -0.0703125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.004059915816469584, | |
| "epoch": 0.0176, | |
| "grad_norm": 0.41427480690422763, | |
| "kl": 0.008981704711914062, | |
| "learning_rate": 5e-07, | |
| "loss": 0.042, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3201.0703125, | |
| "epoch": 0.0184, | |
| "grad_norm": 0.43906528625565505, | |
| "kl": 0.0200042724609375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0944, | |
| "reward": 0.30175782716833055, | |
| "reward_std": 0.40742881037294865, | |
| "rewards/end_of_conversation_reward_func": 0.0546875013387762, | |
| "rewards/end_rm_reward_func": 0.3291015625, | |
| "rewards/length_reward_func": -0.08203125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.004639639038941823, | |
| "epoch": 0.0192, | |
| "grad_norm": 0.43593248124452694, | |
| "kl": 0.022314071655273438, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0943, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3829.46875, | |
| "epoch": 0.02, | |
| "grad_norm": 0.38202405603939055, | |
| "kl": 0.009199142456054688, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1122, | |
| "reward": 0.16308594099245965, | |
| "reward_std": 0.5269097108393908, | |
| "rewards/end_of_conversation_reward_func": 0.04296875064028427, | |
| "rewards/end_rm_reward_func": 0.2138671875, | |
| "rewards/length_reward_func": -0.09375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.003962479371693917, | |
| "epoch": 0.0208, | |
| "grad_norm": 0.3735492080285575, | |
| "kl": 0.010051727294921875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1122, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2547.2109375, | |
| "epoch": 0.0216, | |
| "grad_norm": 0.610378722681955, | |
| "kl": 0.0273590087890625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0282, | |
| "reward": 0.27187501452863216, | |
| "reward_std": 0.4403302203863859, | |
| "rewards/end_of_conversation_reward_func": 0.05703125102445483, | |
| "rewards/end_rm_reward_func": 0.25, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.005169034717255272, | |
| "epoch": 0.0224, | |
| "grad_norm": 0.6320276760824112, | |
| "kl": 0.0298004150390625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.028, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3147.9140625, | |
| "epoch": 0.0232, | |
| "grad_norm": 0.48988346522636234, | |
| "kl": 0.018934249877929688, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1053, | |
| "reward": 0.14648437302093953, | |
| "reward_std": 0.3592515978962183, | |
| "rewards/end_of_conversation_reward_func": 0.03906250069849193, | |
| "rewards/end_rm_reward_func": 0.166015625, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0043003826576750726, | |
| "epoch": 0.024, | |
| "grad_norm": 0.4903022425159312, | |
| "kl": 0.022472381591796875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1051, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3250.6953125, | |
| "epoch": 0.0248, | |
| "grad_norm": 0.4722499061352693, | |
| "kl": 0.0226593017578125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.09, | |
| "reward": 0.27050781436264515, | |
| "reward_std": 0.5021275784820318, | |
| "rewards/end_of_conversation_reward_func": 0.04687500075669959, | |
| "rewards/end_rm_reward_func": 0.2900390625, | |
| "rewards/length_reward_func": -0.06640625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.004512525454629213, | |
| "epoch": 0.0256, | |
| "grad_norm": 0.47562899020814536, | |
| "kl": 0.023143768310546875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.09, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2877.4140625, | |
| "epoch": 0.0264, | |
| "grad_norm": 0.5889200019558475, | |
| "kl": 0.029821395874023438, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0587, | |
| "reward": 0.21738281490979716, | |
| "reward_std": 0.43986387038603425, | |
| "rewards/end_of_conversation_reward_func": 0.048437500605359674, | |
| "rewards/end_rm_reward_func": 0.2275390625, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.004839819855988026, | |
| "epoch": 0.0272, | |
| "grad_norm": 0.5773456233238853, | |
| "kl": 0.04634857177734375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0584, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3019.0078125, | |
| "epoch": 0.028, | |
| "grad_norm": 0.5859980971392886, | |
| "kl": 0.05249786376953125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0516, | |
| "reward": 0.16308594855945557, | |
| "reward_std": 0.4561401130631566, | |
| "rewards/end_of_conversation_reward_func": 0.046875000989530236, | |
| "rewards/end_rm_reward_func": 0.1904296875, | |
| "rewards/length_reward_func": -0.07421875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.005565648170886561, | |
| "epoch": 0.0288, | |
| "grad_norm": 0.5723337880881523, | |
| "kl": 0.07667350769042969, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0514, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3228.2421875, | |
| "epoch": 0.0296, | |
| "grad_norm": 0.44518950059783957, | |
| "kl": 0.0410919189453125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0187, | |
| "reward": 0.13417969300644472, | |
| "reward_std": 0.40079194540157914, | |
| "rewards/end_of_conversation_reward_func": 0.04531250096624717, | |
| "rewards/end_rm_reward_func": 0.1240234375, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.003970242956711445, | |
| "epoch": 0.0304, | |
| "grad_norm": 0.4361401996932686, | |
| "kl": 0.03826141357421875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0186, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3131.0546875, | |
| "epoch": 0.0312, | |
| "grad_norm": 0.47844256212930125, | |
| "kl": 0.03959083557128906, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0714, | |
| "reward": 0.19863281957805157, | |
| "reward_std": 0.46341709420084953, | |
| "rewards/end_of_conversation_reward_func": 0.04140625090803951, | |
| "rewards/end_rm_reward_func": 0.2041015625, | |
| "rewards/length_reward_func": -0.046875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.004569632888888009, | |
| "epoch": 0.032, | |
| "grad_norm": 0.4703050101042975, | |
| "kl": 0.033267974853515625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0714, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3917.1875, | |
| "epoch": 0.0328, | |
| "grad_norm": 0.3673788800178079, | |
| "kl": 0.046047210693359375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0997, | |
| "reward": 0.18164063431322575, | |
| "reward_std": 0.4014846673235297, | |
| "rewards/end_of_conversation_reward_func": 0.042968751047737896, | |
| "rewards/end_rm_reward_func": 0.220703125, | |
| "rewards/length_reward_func": -0.08203125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0038312822725856677, | |
| "epoch": 0.0336, | |
| "grad_norm": 0.3586932712636358, | |
| "kl": 0.0465850830078125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0996, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3266.2734375, | |
| "epoch": 0.0344, | |
| "grad_norm": 0.5611692724392116, | |
| "kl": 0.0754852294921875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0169, | |
| "reward": 0.026171877863816917, | |
| "reward_std": 0.3950889599509537, | |
| "rewards/end_of_conversation_reward_func": 0.03593750059371814, | |
| "rewards/end_rm_reward_func": 0.064453125, | |
| "rewards/length_reward_func": -0.07421875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.004459991134353913, | |
| "epoch": 0.0352, | |
| "grad_norm": 0.650232454614409, | |
| "kl": 0.05011749267578125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0168, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3211.5859375, | |
| "epoch": 0.036, | |
| "grad_norm": 0.44021307945681076, | |
| "kl": 0.09033966064453125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0972, | |
| "reward": 0.2980468822643161, | |
| "reward_std": 0.4539717771112919, | |
| "rewards/end_of_conversation_reward_func": 0.0421875006868504, | |
| "rewards/end_rm_reward_func": 0.314453125, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.004655172117054462, | |
| "epoch": 0.0368, | |
| "grad_norm": 0.432964397523799, | |
| "kl": 0.10763931274414062, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0971, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2954.6171875, | |
| "epoch": 0.0376, | |
| "grad_norm": 0.506733600693208, | |
| "kl": 0.080352783203125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1313, | |
| "reward": 0.1996093881316483, | |
| "reward_std": 0.4217355151195079, | |
| "rewards/end_of_conversation_reward_func": 0.04921875084983185, | |
| "rewards/end_rm_reward_func": 0.224609375, | |
| "rewards/length_reward_func": -0.07421875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.00481395295355469, | |
| "epoch": 0.0384, | |
| "grad_norm": 0.4925165842097727, | |
| "kl": 0.09535980224609375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1311, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3474.7890625, | |
| "epoch": 0.0392, | |
| "grad_norm": 0.4471083539693922, | |
| "kl": 0.10435867309570312, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1437, | |
| "reward": 0.32910157029982656, | |
| "reward_std": 0.508234778419137, | |
| "rewards/end_of_conversation_reward_func": 0.046875000873114914, | |
| "rewards/end_rm_reward_func": 0.3525390625, | |
| "rewards/length_reward_func": -0.0703125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0046363398869289085, | |
| "epoch": 0.04, | |
| "grad_norm": 0.43209286829846705, | |
| "kl": 0.17406082153320312, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1436, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2888.265625, | |
| "epoch": 0.0408, | |
| "grad_norm": 0.5175016713586391, | |
| "kl": 0.17282867431640625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0991, | |
| "reward": 0.27207032637670636, | |
| "reward_std": 0.4736290629953146, | |
| "rewards/end_of_conversation_reward_func": 0.052343750779982656, | |
| "rewards/end_rm_reward_func": 0.2705078125, | |
| "rewards/length_reward_func": -0.05078125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.005186622860492207, | |
| "epoch": 0.0416, | |
| "grad_norm": 0.5022931530008493, | |
| "kl": 0.64031982421875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0989, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2426.1796875, | |
| "epoch": 0.0424, | |
| "grad_norm": 0.6255369968307551, | |
| "kl": 0.22528839111328125, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0125, | |
| "reward": 0.3054687652620487, | |
| "reward_std": 0.3840066557750106, | |
| "rewards/end_of_conversation_reward_func": 0.06328125123400241, | |
| "rewards/end_rm_reward_func": 0.2734375, | |
| "rewards/length_reward_func": -0.03125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.00623530795564875, | |
| "epoch": 0.0432, | |
| "grad_norm": 0.6007417569260441, | |
| "kl": 0.28375244140625, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0127, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3665.03125, | |
| "epoch": 0.044, | |
| "grad_norm": 0.44582887064322585, | |
| "kl": 0.31996917724609375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0533, | |
| "reward": 0.15429688058793545, | |
| "reward_std": 0.42791955079883337, | |
| "rewards/end_of_conversation_reward_func": 0.046875000989530236, | |
| "rewards/end_rm_reward_func": 0.189453125, | |
| "rewards/length_reward_func": -0.08203125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.004688024055212736, | |
| "epoch": 0.0448, | |
| "grad_norm": 0.4305811562261461, | |
| "kl": 0.25885772705078125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0533, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3248.5, | |
| "epoch": 0.0456, | |
| "grad_norm": 0.4802130548748173, | |
| "kl": 1.8993988037109375, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0111, | |
| "reward": 0.23535156849538907, | |
| "reward_std": 0.49618958681821823, | |
| "rewards/end_of_conversation_reward_func": 0.05078125128056854, | |
| "rewards/end_rm_reward_func": 0.2275390625, | |
| "rewards/length_reward_func": -0.04296875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.005161113862413913, | |
| "epoch": 0.0464, | |
| "grad_norm": 0.4715970994501386, | |
| "kl": 2.4185791015625, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0113, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3355.9921875, | |
| "epoch": 0.0472, | |
| "grad_norm": 0.4817763963120568, | |
| "kl": 3.2315750122070312, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1374, | |
| "reward": 0.1996093873749487, | |
| "reward_std": 0.39307427662424743, | |
| "rewards/end_of_conversation_reward_func": 0.04921875096624717, | |
| "rewards/end_rm_reward_func": 0.212890625, | |
| "rewards/length_reward_func": -0.0625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.004419020988279954, | |
| "epoch": 0.048, | |
| "grad_norm": 0.4663422707726828, | |
| "kl": 1.4642868041992188, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1374, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2897.2890625, | |
| "epoch": 0.0488, | |
| "grad_norm": 0.588164762943858, | |
| "kl": 0.23455047607421875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0579, | |
| "reward": 0.2761718816473149, | |
| "reward_std": 0.4508576055523008, | |
| "rewards/end_of_conversation_reward_func": 0.051562500884756446, | |
| "rewards/end_rm_reward_func": 0.251953125, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.005412581740529276, | |
| "epoch": 0.0496, | |
| "grad_norm": 0.5649279212366, | |
| "kl": 0.44388580322265625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0579, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2572.625, | |
| "epoch": 0.0504, | |
| "grad_norm": 0.7907651458985474, | |
| "kl": 38.2191162109375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0775, | |
| "reward": 0.24921876098960638, | |
| "reward_std": 0.4261420601978898, | |
| "rewards/end_of_conversation_reward_func": 0.05781250057043508, | |
| "rewards/end_rm_reward_func": 0.22265625, | |
| "rewards/length_reward_func": -0.03125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.005620055002509616, | |
| "epoch": 0.0512, | |
| "grad_norm": 0.6418001802934817, | |
| "kl": 15.695816040039062, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0772, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2276.734375, | |
| "epoch": 0.052, | |
| "grad_norm": 0.665222722206486, | |
| "kl": 0.13232421875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0409, | |
| "reward": 0.40312501695007086, | |
| "reward_std": 0.4724201774224639, | |
| "rewards/end_of_conversation_reward_func": 0.06718750135041773, | |
| "rewards/end_rm_reward_func": 0.36328125, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.005610677297227085, | |
| "epoch": 0.0528, | |
| "grad_norm": 0.5939092022405507, | |
| "kl": 0.1356201171875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0405, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2571.578125, | |
| "epoch": 0.0536, | |
| "grad_norm": 0.5313848515306442, | |
| "kl": 0.2240753173828125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1096, | |
| "reward": 0.3613281352445483, | |
| "reward_std": 0.5025414768606424, | |
| "rewards/end_of_conversation_reward_func": 0.05859375069849193, | |
| "rewards/end_rm_reward_func": 0.349609375, | |
| "rewards/length_reward_func": -0.046875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.005505270150024444, | |
| "epoch": 0.0544, | |
| "grad_norm": 0.5239589241494494, | |
| "kl": 0.351348876953125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1093, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2831.0625, | |
| "epoch": 0.0552, | |
| "grad_norm": 0.5445532025053365, | |
| "kl": 0.248504638671875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0613, | |
| "reward": 0.22050781839061528, | |
| "reward_std": 0.4537838753312826, | |
| "rewards/end_of_conversation_reward_func": 0.05156250129221007, | |
| "rewards/end_rm_reward_func": 0.2275390625, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.00578475151269231, | |
| "epoch": 0.056, | |
| "grad_norm": 0.5324428626427173, | |
| "kl": 0.200439453125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0611, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2407.078125, | |
| "epoch": 0.0568, | |
| "grad_norm": 0.7082243961454275, | |
| "kl": 0.5781097412109375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0656, | |
| "reward": 0.44589844811707735, | |
| "reward_std": 0.4496405338868499, | |
| "rewards/end_of_conversation_reward_func": 0.0640625013038516, | |
| "rewards/end_rm_reward_func": 0.4091796875, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.006228098354768008, | |
| "epoch": 0.0576, | |
| "grad_norm": 0.6279502761796933, | |
| "kl": 0.60986328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0654, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2754.171875, | |
| "epoch": 0.0584, | |
| "grad_norm": 0.6511514863553758, | |
| "kl": 0.6928863525390625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0478, | |
| "reward": 0.26621094363508746, | |
| "reward_std": 0.3944235248491168, | |
| "rewards/end_of_conversation_reward_func": 0.06015625048894435, | |
| "rewards/end_rm_reward_func": 0.2451171875, | |
| "rewards/length_reward_func": -0.0390625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.005119593348354101, | |
| "epoch": 0.0592, | |
| "grad_norm": 0.5405583476661738, | |
| "kl": 0.45965576171875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0477, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2714.515625, | |
| "epoch": 0.06, | |
| "grad_norm": 0.5541156231836364, | |
| "kl": 1.046234130859375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0652, | |
| "reward": 0.26230468694120646, | |
| "reward_std": 0.4206010536290705, | |
| "rewards/end_of_conversation_reward_func": 0.05234375054715201, | |
| "rewards/end_rm_reward_func": 0.2373046875, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.005520118284039199, | |
| "epoch": 0.0608, | |
| "grad_norm": 0.5198654147754546, | |
| "kl": 0.9281158447265625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0651, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2761.2578125, | |
| "epoch": 0.0616, | |
| "grad_norm": 0.5969632201148187, | |
| "kl": 0.617431640625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0339, | |
| "reward": 0.30253906978759915, | |
| "reward_std": 0.42008747160434723, | |
| "rewards/end_of_conversation_reward_func": 0.059375002048909664, | |
| "rewards/end_rm_reward_func": 0.2744140625, | |
| "rewards/length_reward_func": -0.03125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.005989336816128343, | |
| "epoch": 0.0624, | |
| "grad_norm": 0.5878458096207411, | |
| "kl": 0.732513427734375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0337, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2543.9453125, | |
| "epoch": 0.0632, | |
| "grad_norm": 1.0199433093137067, | |
| "kl": 1.05322265625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0652, | |
| "reward": 0.23144532518927008, | |
| "reward_std": 0.4124359574634582, | |
| "rewards/end_of_conversation_reward_func": 0.06640625139698386, | |
| "rewards/end_rm_reward_func": 0.2236328125, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.005892957837204449, | |
| "epoch": 0.064, | |
| "grad_norm": 0.9103349241587038, | |
| "kl": 2.14288330078125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0649, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3057.0546875, | |
| "epoch": 0.0648, | |
| "grad_norm": 1.4360222564196798, | |
| "kl": 1.4345703125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0872, | |
| "reward": 0.21894532290752977, | |
| "reward_std": 0.44305523252114654, | |
| "rewards/end_of_conversation_reward_func": 0.053906250395812094, | |
| "rewards/end_rm_reward_func": 0.2197265625, | |
| "rewards/length_reward_func": -0.0546875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.005477508588228375, | |
| "epoch": 0.0656, | |
| "grad_norm": 1.0247564904674444, | |
| "kl": 1.27581787109375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0865, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2772.40625, | |
| "epoch": 0.0664, | |
| "grad_norm": 0.7761581493328232, | |
| "kl": 3.468292236328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0878, | |
| "reward": 0.260742200887762, | |
| "reward_std": 0.4183371504768729, | |
| "rewards/end_of_conversation_reward_func": 0.05468750174622983, | |
| "rewards/end_rm_reward_func": 0.2646484375, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.006108275701990351, | |
| "epoch": 0.0672, | |
| "grad_norm": 0.6445774473316636, | |
| "kl": 3.2357177734375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0874, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2965.75, | |
| "epoch": 0.068, | |
| "grad_norm": 0.5452316944081127, | |
| "kl": 1.18096923828125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0644, | |
| "reward": 0.19003907265141606, | |
| "reward_std": 0.41479692701250315, | |
| "rewards/end_of_conversation_reward_func": 0.05625000095460564, | |
| "rewards/end_rm_reward_func": 0.1767578125, | |
| "rewards/length_reward_func": -0.04296875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.005549885943764821, | |
| "epoch": 0.0688, | |
| "grad_norm": 0.5370859324762662, | |
| "kl": 1.083465576171875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0644, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2448.203125, | |
| "epoch": 0.0696, | |
| "grad_norm": 0.665995849204049, | |
| "kl": 0.88134765625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0717, | |
| "reward": 0.37382814288139343, | |
| "reward_std": 0.43495669635012746, | |
| "rewards/end_of_conversation_reward_func": 0.05937500041909516, | |
| "rewards/end_rm_reward_func": 0.330078125, | |
| "rewards/length_reward_func": -0.015625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.006831952749053016, | |
| "epoch": 0.0704, | |
| "grad_norm": 0.6138460891472821, | |
| "kl": 0.7757568359375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0715, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2844.96875, | |
| "epoch": 0.0712, | |
| "grad_norm": 39.91934775938643, | |
| "kl": 7.2490234375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0724, | |
| "reward": 0.2925781449303031, | |
| "reward_std": 0.37382530118338764, | |
| "rewards/end_of_conversation_reward_func": 0.06015625153668225, | |
| "rewards/end_rm_reward_func": 0.279296875, | |
| "rewards/length_reward_func": -0.046875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.005283550621243194, | |
| "epoch": 0.072, | |
| "grad_norm": 2.874374713759721, | |
| "kl": 2.914276123046875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0605, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2769.65625, | |
| "epoch": 0.0728, | |
| "grad_norm": 0.7216272604731127, | |
| "kl": 15.577178955078125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0653, | |
| "reward": 0.21445313305594027, | |
| "reward_std": 0.3568238094449043, | |
| "rewards/end_of_conversation_reward_func": 0.056250000779982656, | |
| "rewards/end_rm_reward_func": 0.201171875, | |
| "rewards/length_reward_func": -0.04296875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.006356269208481535, | |
| "epoch": 0.0736, | |
| "grad_norm": 0.657422156714835, | |
| "kl": 3.359100341796875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0652, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2521.03125, | |
| "epoch": 0.0744, | |
| "grad_norm": 0.5916805123057853, | |
| "kl": 6.11346435546875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0434, | |
| "reward": 0.3089843865018338, | |
| "reward_std": 0.42485503386706114, | |
| "rewards/end_of_conversation_reward_func": 0.05703125149011612, | |
| "rewards/end_rm_reward_func": 0.275390625, | |
| "rewards/length_reward_func": -0.0234375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.00618546042824164, | |
| "epoch": 0.0752, | |
| "grad_norm": 0.5823172792796925, | |
| "kl": 7.6622314453125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0433, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2846.6015625, | |
| "epoch": 0.076, | |
| "grad_norm": 7.8688189093488035, | |
| "kl": 11.725372314453125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0675, | |
| "reward": 0.2693359488621354, | |
| "reward_std": 0.41367775388062, | |
| "rewards/end_of_conversation_reward_func": 0.05937500059371814, | |
| "rewards/end_rm_reward_func": 0.2724609375, | |
| "rewards/length_reward_func": -0.0625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.005790801937109791, | |
| "epoch": 0.0768, | |
| "grad_norm": 3.368525315890326, | |
| "kl": 13.3575439453125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0631, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2476.9765625, | |
| "epoch": 0.0776, | |
| "grad_norm": 0.7030982161718689, | |
| "kl": 1.7344970703125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0428, | |
| "reward": 0.21621094946749508, | |
| "reward_std": 0.35986063024029136, | |
| "rewards/end_of_conversation_reward_func": 0.07265625102445483, | |
| "rewards/end_rm_reward_func": 0.1708984375, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0070846437884029, | |
| "epoch": 0.0784, | |
| "grad_norm": 0.7129163912479551, | |
| "kl": 1.77899169921875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0427, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2593.5234375, | |
| "epoch": 0.0792, | |
| "grad_norm": 0.7859235708182953, | |
| "kl": 2.969696044921875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1376, | |
| "reward": 0.32734376145526767, | |
| "reward_std": 0.288719222182408, | |
| "rewards/end_of_conversation_reward_func": 0.06171875155996531, | |
| "rewards/end_rm_reward_func": 0.30859375, | |
| "rewards/length_reward_func": -0.04296875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0061235044413479045, | |
| "epoch": 0.08, | |
| "grad_norm": 0.6828444160836236, | |
| "kl": 2.0836181640625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1373, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2805.28125, | |
| "epoch": 0.0808, | |
| "grad_norm": 0.5849174504978679, | |
| "kl": 1.0841064453125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0062, | |
| "reward": 0.21914063394069672, | |
| "reward_std": 0.43392230197787285, | |
| "rewards/end_of_conversation_reward_func": 0.05312500067520887, | |
| "rewards/end_rm_reward_func": 0.189453125, | |
| "rewards/length_reward_func": -0.0234375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0056501738872611895, | |
| "epoch": 0.0816, | |
| "grad_norm": 0.5867282574139662, | |
| "kl": 0.767822265625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.006, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2540.4609375, | |
| "epoch": 0.0824, | |
| "grad_norm": 0.6250029445368647, | |
| "kl": 2.111724853515625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0241, | |
| "reward": 0.29882813175208867, | |
| "reward_std": 0.4388907542452216, | |
| "rewards/end_of_conversation_reward_func": 0.06250000087311491, | |
| "rewards/end_rm_reward_func": 0.271484375, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.005285024351906031, | |
| "epoch": 0.0832, | |
| "grad_norm": 0.6215749933523446, | |
| "kl": 2.1219482421875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0238, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3044.0, | |
| "epoch": 0.084, | |
| "grad_norm": 0.6597779710468328, | |
| "kl": 6.86090087890625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1377, | |
| "reward": 0.13828126178123057, | |
| "reward_std": 0.43175367498770356, | |
| "rewards/end_of_conversation_reward_func": 0.056250001303851604, | |
| "rewards/end_rm_reward_func": 0.140625, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.005206686502788216, | |
| "epoch": 0.0848, | |
| "grad_norm": 0.5942518563284648, | |
| "kl": 2.425811767578125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1374, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2281.328125, | |
| "epoch": 0.0856, | |
| "grad_norm": 0.7014488445783061, | |
| "kl": 0.6387939453125, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0217, | |
| "reward": 0.2966797042172402, | |
| "reward_std": 0.4203540254384279, | |
| "rewards/end_of_conversation_reward_func": 0.059375001466833055, | |
| "rewards/end_rm_reward_func": 0.2685546875, | |
| "rewards/length_reward_func": -0.03125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.005974609957775101, | |
| "epoch": 0.0864, | |
| "grad_norm": 0.6586072574619627, | |
| "kl": 0.62811279296875, | |
| "learning_rate": 5e-07, | |
| "loss": -0.022, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2565.8515625, | |
| "epoch": 0.0872, | |
| "grad_norm": 0.644914441904026, | |
| "kl": 6.9007568359375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0398, | |
| "reward": 0.3605468962341547, | |
| "reward_std": 0.3907069666311145, | |
| "rewards/end_of_conversation_reward_func": 0.0617187509778887, | |
| "rewards/end_rm_reward_func": 0.345703125, | |
| "rewards/length_reward_func": -0.046875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.005732855395763181, | |
| "epoch": 0.088, | |
| "grad_norm": 0.6042885460634405, | |
| "kl": 14.75518798828125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0394, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2593.7109375, | |
| "epoch": 0.0888, | |
| "grad_norm": 0.7580136412646703, | |
| "kl": 194.23867797851562, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0145, | |
| "reward": 0.3703125170432031, | |
| "reward_std": 0.4818801307119429, | |
| "rewards/end_of_conversation_reward_func": 0.06953125121071935, | |
| "rewards/end_rm_reward_func": 0.33984375, | |
| "rewards/length_reward_func": -0.0390625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0058185750094708055, | |
| "epoch": 0.0896, | |
| "grad_norm": 0.707059333498124, | |
| "kl": 93.26089477539062, | |
| "learning_rate": 5e-07, | |
| "loss": 0.014, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2752.3046875, | |
| "epoch": 0.0904, | |
| "grad_norm": 1.2282513544713536, | |
| "kl": 282.3605651855469, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0203, | |
| "reward": 0.22753907297737896, | |
| "reward_std": 0.3849699907004833, | |
| "rewards/end_of_conversation_reward_func": 0.06250000139698386, | |
| "rewards/end_rm_reward_func": 0.2158203125, | |
| "rewards/length_reward_func": -0.05078125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.006134674651548266, | |
| "epoch": 0.0912, | |
| "grad_norm": 0.659663811454641, | |
| "kl": 1257.621337890625, | |
| "learning_rate": 5e-07, | |
| "loss": -0.021, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2639.4375, | |
| "epoch": 0.092, | |
| "grad_norm": 0.7161070250692486, | |
| "kl": 3.4075927734375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0562, | |
| "reward": 0.279296881868504, | |
| "reward_std": 0.36566486582159996, | |
| "rewards/end_of_conversation_reward_func": 0.06250000069849193, | |
| "rewards/end_rm_reward_func": 0.244140625, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.005702364520402625, | |
| "epoch": 0.0928, | |
| "grad_norm": 0.7521060467303439, | |
| "kl": 2.76971435546875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.056, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2755.171875, | |
| "epoch": 0.0936, | |
| "grad_norm": 0.5773097594300078, | |
| "kl": 1.43548583984375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0308, | |
| "reward": 0.28496095119044185, | |
| "reward_std": 0.44529066514223814, | |
| "rewards/end_of_conversation_reward_func": 0.07109375135041773, | |
| "rewards/end_rm_reward_func": 0.2412109375, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.00540813866246026, | |
| "epoch": 0.0944, | |
| "grad_norm": 0.5453752455228841, | |
| "kl": 1.4910888671875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0306, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2547.6640625, | |
| "epoch": 0.0952, | |
| "grad_norm": 0.7741674220919301, | |
| "kl": 3.27825927734375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0425, | |
| "reward": 0.3609375129453838, | |
| "reward_std": 0.5046179071068764, | |
| "rewards/end_of_conversation_reward_func": 0.05625000031432137, | |
| "rewards/end_rm_reward_func": 0.3515625, | |
| "rewards/length_reward_func": -0.046875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.005344324657926336, | |
| "epoch": 0.096, | |
| "grad_norm": 0.6695041005132681, | |
| "kl": 3.209014892578125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0421, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2054.7734375, | |
| "epoch": 0.0968, | |
| "grad_norm": 0.6599312097182909, | |
| "kl": 1.108734130859375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0745, | |
| "reward": 0.48066408233717084, | |
| "reward_std": 0.41660537058487535, | |
| "rewards/end_of_conversation_reward_func": 0.06562500132713467, | |
| "rewards/end_rm_reward_func": 0.4501953125, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.005748817842686549, | |
| "epoch": 0.0976, | |
| "grad_norm": 0.6350464464579841, | |
| "kl": 1.254302978515625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0742, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2930.9453125, | |
| "epoch": 0.0984, | |
| "grad_norm": 0.5357408536545809, | |
| "kl": 1.457122802734375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0746, | |
| "reward": 0.3414062693482265, | |
| "reward_std": 0.4845976228825748, | |
| "rewards/end_of_conversation_reward_func": 0.056250001420266926, | |
| "rewards/end_rm_reward_func": 0.3125, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0055590872943867, | |
| "epoch": 0.0992, | |
| "grad_norm": 0.48529125845684484, | |
| "kl": 3.254119873046875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0744, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2267.65625, | |
| "epoch": 0.1, | |
| "grad_norm": 0.6535834008944279, | |
| "kl": 1.0345458984375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.052, | |
| "reward": 0.2982421967899427, | |
| "reward_std": 0.3505426752381027, | |
| "rewards/end_of_conversation_reward_func": 0.05703125079162419, | |
| "rewards/end_rm_reward_func": 0.2607421875, | |
| "rewards/length_reward_func": -0.01953125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.005725323280785233, | |
| "epoch": 0.1008, | |
| "grad_norm": 0.6302935877001168, | |
| "kl": 1.271240234375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0516, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2597.3203125, | |
| "epoch": 0.1016, | |
| "grad_norm": 0.6427620110094884, | |
| "kl": 2.03521728515625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0256, | |
| "reward": 0.3076172007713467, | |
| "reward_std": 0.35239528538659215, | |
| "rewards/end_of_conversation_reward_func": 0.05859375052386895, | |
| "rewards/end_rm_reward_func": 0.2724609375, | |
| "rewards/length_reward_func": -0.0234375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.006149623615783639, | |
| "epoch": 0.1024, | |
| "grad_norm": 0.6579464818780976, | |
| "kl": 2.0386962890625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0253, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2802.3125, | |
| "epoch": 0.1032, | |
| "grad_norm": 0.8734444908028882, | |
| "kl": 17.40386962890625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1071, | |
| "reward": 0.33613283321028575, | |
| "reward_std": 0.38626000890508294, | |
| "rewards/end_of_conversation_reward_func": 0.06953125214204192, | |
| "rewards/end_rm_reward_func": 0.2978515625, | |
| "rewards/length_reward_func": -0.03125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.00546567996207159, | |
| "epoch": 0.104, | |
| "grad_norm": 0.7033969362455426, | |
| "kl": 12.68121337890625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1066, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3074.7109375, | |
| "epoch": 0.1048, | |
| "grad_norm": 1.9983054128585318, | |
| "kl": 80.3271484375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0574, | |
| "reward": 0.2843750170432031, | |
| "reward_std": 0.5465462752617896, | |
| "rewards/end_of_conversation_reward_func": 0.061718751094304025, | |
| "rewards/end_rm_reward_func": 0.28125, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.005686903212335892, | |
| "epoch": 0.1056, | |
| "grad_norm": 0.7124750782840086, | |
| "kl": 70.491943359375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.056, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2400.2265625, | |
| "epoch": 0.1064, | |
| "grad_norm": 0.9314176839191374, | |
| "kl": 4.3787841796875, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0058, | |
| "reward": 0.3619140745140612, | |
| "reward_std": 0.41163346637040377, | |
| "rewards/end_of_conversation_reward_func": 0.06015625118743628, | |
| "rewards/end_rm_reward_func": 0.3603515625, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.006425623796530999, | |
| "epoch": 0.1072, | |
| "grad_norm": 0.6428707587124971, | |
| "kl": 5.10992431640625, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0064, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2813.4140625, | |
| "epoch": 0.108, | |
| "grad_norm": 1.2731691909842648, | |
| "kl": 7265.392883300781, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0596, | |
| "reward": 0.2429687607800588, | |
| "reward_std": 0.37734482274390757, | |
| "rewards/end_of_conversation_reward_func": 0.06328125105937943, | |
| "rewards/end_rm_reward_func": 0.22265625, | |
| "rewards/length_reward_func": -0.04296875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.006233511230675504, | |
| "epoch": 0.1088, | |
| "grad_norm": 0.7280723173732075, | |
| "kl": 596.9638061523438, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0587, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2861.9296875, | |
| "epoch": 0.1096, | |
| "grad_norm": 0.5765466847825812, | |
| "kl": 1.0167236328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0295, | |
| "reward": 0.2761718863621354, | |
| "reward_std": 0.40949083073064685, | |
| "rewards/end_of_conversation_reward_func": 0.05937500135041773, | |
| "rewards/end_rm_reward_func": 0.255859375, | |
| "rewards/length_reward_func": -0.0390625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.004913579861749895, | |
| "epoch": 0.1104, | |
| "grad_norm": 0.561800571593937, | |
| "kl": 0.811767578125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0293, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2359.171875, | |
| "epoch": 0.1112, | |
| "grad_norm": 10.321113205088718, | |
| "kl": 6.01751708984375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0651, | |
| "reward": 0.24648438091389835, | |
| "reward_std": 0.3026078275870532, | |
| "rewards/end_of_conversation_reward_func": 0.06875000149011612, | |
| "rewards/end_rm_reward_func": 0.193359375, | |
| "rewards/length_reward_func": -0.015625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.006259943009354174, | |
| "epoch": 0.112, | |
| "grad_norm": 1.450404363065613, | |
| "kl": 1.487548828125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0588, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2657.4375, | |
| "epoch": 0.1128, | |
| "grad_norm": 1.6913748735699703, | |
| "kl": 1.611328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0549, | |
| "reward": 0.4406250237952918, | |
| "reward_std": 0.38095375150442123, | |
| "rewards/end_of_conversation_reward_func": 0.061718751734588295, | |
| "rewards/end_rm_reward_func": 0.41796875, | |
| "rewards/length_reward_func": -0.0390625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.005613551795249805, | |
| "epoch": 0.1136, | |
| "grad_norm": 0.7958482694330619, | |
| "kl": 1.34564208984375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.054, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2331.3984375, | |
| "epoch": 0.1144, | |
| "grad_norm": 0.6690401528004118, | |
| "kl": 0.93682861328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0346, | |
| "reward": 0.3912109505617991, | |
| "reward_std": 0.4049599929712713, | |
| "rewards/end_of_conversation_reward_func": 0.06406250066356733, | |
| "rewards/end_rm_reward_func": 0.3466796875, | |
| "rewards/length_reward_func": -0.01953125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0052296058856882155, | |
| "epoch": 0.1152, | |
| "grad_norm": 0.6545569167278118, | |
| "kl": 0.91241455078125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0342, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2549.3515625, | |
| "epoch": 0.116, | |
| "grad_norm": 0.5890775780320036, | |
| "kl": 26.97711181640625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0389, | |
| "reward": 0.23417969699949026, | |
| "reward_std": 0.3516567766200751, | |
| "rewards/end_of_conversation_reward_func": 0.06718750100117177, | |
| "rewards/end_rm_reward_func": 0.1748046875, | |
| "rewards/length_reward_func": -0.0078125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.005101009825011715, | |
| "epoch": 0.1168, | |
| "grad_norm": 0.5467437980921307, | |
| "kl": 70.5947265625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0386, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2663.859375, | |
| "epoch": 0.1176, | |
| "grad_norm": 0.6762584056701121, | |
| "kl": 47.773193359375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0684, | |
| "reward": 0.4140625186264515, | |
| "reward_std": 0.5097206123173237, | |
| "rewards/end_of_conversation_reward_func": 0.06640625069849193, | |
| "rewards/end_rm_reward_func": 0.3828125, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.005189290648559108, | |
| "epoch": 0.1184, | |
| "grad_norm": 0.623836517060842, | |
| "kl": 270.5855712890625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0679, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2959.1171875, | |
| "epoch": 0.1192, | |
| "grad_norm": 0.6671454736198891, | |
| "kl": 0.91912841796875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0542, | |
| "reward": 0.34667970472946763, | |
| "reward_std": 0.3730495397467166, | |
| "rewards/end_of_conversation_reward_func": 0.06250000081490725, | |
| "rewards/end_rm_reward_func": 0.3193359375, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.005951303654001094, | |
| "epoch": 0.12, | |
| "grad_norm": 0.6116887062182381, | |
| "kl": 0.791259765625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0538, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3094.140625, | |
| "epoch": 0.1208, | |
| "grad_norm": 4.667119011332929, | |
| "kl": 2.3486328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0428, | |
| "reward": 0.27773438883014023, | |
| "reward_std": 0.47337998705916107, | |
| "rewards/end_of_conversation_reward_func": 0.06093750102445483, | |
| "rewards/end_rm_reward_func": 0.240234375, | |
| "rewards/length_reward_func": -0.0234375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0058815906959353015, | |
| "epoch": 0.1216, | |
| "grad_norm": 179.58027001282468, | |
| "kl": 15.09307861328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0682, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3117.2890625, | |
| "epoch": 0.1224, | |
| "grad_norm": 0.7534678082177136, | |
| "kl": 12.952392578125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0408, | |
| "reward": 0.22148438543081284, | |
| "reward_std": 0.42119756643660367, | |
| "rewards/end_of_conversation_reward_func": 0.059375000884756446, | |
| "rewards/end_rm_reward_func": 0.220703125, | |
| "rewards/length_reward_func": -0.05859375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.004391293317894451, | |
| "epoch": 0.1232, | |
| "grad_norm": 0.5828739595215712, | |
| "kl": 8.2723388671875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0403, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2844.5234375, | |
| "epoch": 0.124, | |
| "grad_norm": 0.716894901784797, | |
| "kl": 7.204345703125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0173, | |
| "reward": 0.35722657898440957, | |
| "reward_std": 0.3928870742674917, | |
| "rewards/end_of_conversation_reward_func": 0.07500000135041773, | |
| "rewards/end_rm_reward_func": 0.3369140625, | |
| "rewards/length_reward_func": -0.0546875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.004953728028340265, | |
| "epoch": 0.1248, | |
| "grad_norm": 0.7207403983692948, | |
| "kl": 2.5550537109375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0171, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2347.9765625, | |
| "epoch": 0.1256, | |
| "grad_norm": 0.8185732891233355, | |
| "kl": 1.7054443359375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0532, | |
| "reward": 0.2826172022614628, | |
| "reward_std": 0.4683522223494947, | |
| "rewards/end_of_conversation_reward_func": 0.06484375114087015, | |
| "rewards/end_rm_reward_func": 0.2529296875, | |
| "rewards/length_reward_func": -0.03515625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.006606276831007563, | |
| "epoch": 0.1264, | |
| "grad_norm": 0.728399034605529, | |
| "kl": 1.5609130859375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0526, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2569.265625, | |
| "epoch": 0.1272, | |
| "grad_norm": 0.7190757659849589, | |
| "kl": 0.906005859375, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0206, | |
| "reward": 0.2970703258179128, | |
| "reward_std": 0.3366057106759399, | |
| "rewards/end_of_conversation_reward_func": 0.06171875074505806, | |
| "rewards/end_rm_reward_func": 0.2626953125, | |
| "rewards/length_reward_func": -0.02734375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.005102125694975257, | |
| "epoch": 0.128, | |
| "grad_norm": 0.6672444048962334, | |
| "kl": 0.8017578125, | |
| "learning_rate": 5e-07, | |
| "loss": -0.021, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2620.1484375, | |
| "epoch": 0.1288, | |
| "grad_norm": 0.7595254995251381, | |
| "kl": 1.82080078125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0537, | |
| "reward": 0.3201171928085387, | |
| "reward_std": 0.4111205171793699, | |
| "rewards/end_of_conversation_reward_func": 0.07109375111758709, | |
| "rewards/end_rm_reward_func": 0.2724609375, | |
| "rewards/length_reward_func": -0.0234375, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.004954093805281445, | |
| "epoch": 0.1296, | |
| "grad_norm": 0.743108178535059, | |
| "kl": 2.1588134765625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0532, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2186.5078125, | |
| "epoch": 0.1304, | |
| "grad_norm": 1.324960021503415, | |
| "kl": 3.0037841796875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0527, | |
| "reward": 0.3619140777736902, | |
| "reward_std": 0.38691631401889026, | |
| "rewards/end_of_conversation_reward_func": 0.06015625048894435, | |
| "rewards/end_rm_reward_func": 0.3056640625, | |
| "rewards/length_reward_func": -0.00390625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.005645596262183972, | |
| "epoch": 0.1312, | |
| "grad_norm": 1.0819168226765692, | |
| "kl": 10.2042236328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.052, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2619.4140625, | |
| "epoch": 0.132, | |
| "grad_norm": 0.7079010578463608, | |
| "kl": 1.164794921875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0719, | |
| "reward": 0.3517578286700882, | |
| "reward_std": 0.444250165252015, | |
| "rewards/end_of_conversation_reward_func": 0.0695312509778887, | |
| "rewards/end_rm_reward_func": 0.3251953125, | |
| "rewards/length_reward_func": -0.04296875, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.004916784484521486, | |
| "epoch": 0.1328, | |
| "grad_norm": 0.667503324712686, | |
| "kl": 1.4266357421875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0714, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2411.8671875, | |
| "epoch": 0.1336, | |
| "grad_norm": 2.7317288434410036, | |
| "kl": 2.0714111328125, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1114, | |
| "reward": 0.21445313666481525, | |
| "reward_std": 0.3326614680700004, | |
| "rewards/end_of_conversation_reward_func": 0.06796875107102096, | |
| "rewards/end_rm_reward_func": 0.166015625, | |
| "rewards/length_reward_func": -0.01953125, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0053935582836857066, | |
| "epoch": 0.1344, | |
| "grad_norm": 1.3387444314477779, | |
| "kl": 1.496337890625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1094, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2000.3359375, | |
| "epoch": 0.1352, | |
| "grad_norm": 159.24137003595405, | |
| "kl": 56.654296875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.1291, | |
| "reward": 0.42187501955777407, | |
| "reward_std": 0.47698789834976196, | |
| "rewards/end_of_conversation_reward_func": 0.0664062510477379, | |
| "rewards/end_rm_reward_func": 0.359375, | |
| "rewards/length_reward_func": -0.00390625, | |
| "rewards/thinking_reward_func": 0.0, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.006557086948305368, | |
| "epoch": 0.136, | |
| "grad_norm": 6.8095683630193635, | |
| "kl": 18.90185546875, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0603, | |
| "step": 170 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |