{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 5, "global_step": 494, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004048582995951417, "grad_norm": 57.285691125278106, "learning_rate": 2e-05, "loss": 20.1406, "mean_token_accuracy": 0.5111725330352783, "num_tokens": 72429.0, "step": 1 }, { "epoch": 0.008097165991902834, "grad_norm": 41.99363066045992, "learning_rate": 1.9959514170040488e-05, "loss": 18.6875, "mean_token_accuracy": 0.5240641832351685, "num_tokens": 136634.0, "step": 2 }, { "epoch": 0.012145748987854251, "grad_norm": 38.5726807470561, "learning_rate": 1.9919028340080974e-05, "loss": 19.4375, "mean_token_accuracy": 0.5193748474121094, "num_tokens": 204776.0, "step": 3 }, { "epoch": 0.016194331983805668, "grad_norm": 36.354880235444575, "learning_rate": 1.987854251012146e-05, "loss": 18.5859, "mean_token_accuracy": 0.5202323198318481, "num_tokens": 271293.0, "step": 4 }, { "epoch": 0.020242914979757085, "grad_norm": 28.213868324309114, "learning_rate": 1.9838056680161946e-05, "loss": 17.4609, "mean_token_accuracy": 0.5324159860610962, "num_tokens": 332546.0, "step": 5 }, { "epoch": 0.020242914979757085, "eval_loss": 2.1596875190734863, "eval_mean_token_accuracy": 0.5420514196157455, "eval_num_tokens": 332546.0, "eval_runtime": 3.902, "eval_samples_per_second": 51.256, "eval_steps_per_second": 1.025, "step": 5 }, { "epoch": 0.024291497975708502, "grad_norm": 27.134077029097234, "learning_rate": 1.979757085020243e-05, "loss": 17.4922, "mean_token_accuracy": 0.5339857339859009, "num_tokens": 402301.0, "step": 6 }, { "epoch": 0.02834008097165992, "grad_norm": 18.222480703668843, "learning_rate": 1.9757085020242915e-05, "loss": 17.1172, "mean_token_accuracy": 0.5464459657669067, "num_tokens": 461692.0, "step": 7 }, { "epoch": 0.032388663967611336, "grad_norm": 15.082689481608242, "learning_rate": 1.9716599190283405e-05, "loss": 16.8516, "mean_token_accuracy": 0.5461285710334778, "num_tokens": 538227.0, "step": 8 }, { "epoch": 0.03643724696356275, "grad_norm": 12.197205828835985, "learning_rate": 1.9676113360323887e-05, "loss": 16.1562, "mean_token_accuracy": 0.5573914647102356, "num_tokens": 603109.0, "step": 9 }, { "epoch": 0.04048582995951417, "grad_norm": 13.717242431104397, "learning_rate": 1.9635627530364373e-05, "loss": 15.7266, "mean_token_accuracy": 0.5678688287734985, "num_tokens": 659598.0, "step": 10 }, { "epoch": 0.04048582995951417, "eval_loss": 2.0374999046325684, "eval_mean_token_accuracy": 0.5590415745973587, "eval_num_tokens": 659598.0, "eval_runtime": 0.6198, "eval_samples_per_second": 322.698, "eval_steps_per_second": 6.454, "step": 10 }, { "epoch": 0.044534412955465584, "grad_norm": 12.306486360056866, "learning_rate": 1.959514170040486e-05, "loss": 15.6484, "mean_token_accuracy": 0.5803827047348022, "num_tokens": 714481.0, "step": 11 }, { "epoch": 0.048582995951417005, "grad_norm": 11.708980436467652, "learning_rate": 1.9554655870445346e-05, "loss": 15.4062, "mean_token_accuracy": 0.578364908695221, "num_tokens": 778432.0, "step": 12 }, { "epoch": 0.05263157894736842, "grad_norm": 11.35292985675568, "learning_rate": 1.9514170040485832e-05, "loss": 16.2422, "mean_token_accuracy": 0.5603257417678833, "num_tokens": 843576.0, "step": 13 }, { "epoch": 0.05668016194331984, "grad_norm": 11.721306843681429, "learning_rate": 1.9473684210526318e-05, "loss": 15.4141, "mean_token_accuracy": 0.5766724944114685, "num_tokens": 914252.0, "step": 14 }, { "epoch": 0.06072874493927125, "grad_norm": 11.371453682604804, "learning_rate": 1.94331983805668e-05, "loss": 15.2969, "mean_token_accuracy": 0.5723595023155212, "num_tokens": 976160.0, "step": 15 }, { "epoch": 0.06072874493927125, "eval_loss": 1.9924999475479126, "eval_mean_token_accuracy": 0.5654653459787369, "eval_num_tokens": 976160.0, "eval_runtime": 0.6214, "eval_samples_per_second": 321.855, "eval_steps_per_second": 6.437, "step": 15 }, { "epoch": 0.06477732793522267, "grad_norm": 13.380354993808568, "learning_rate": 1.939271255060729e-05, "loss": 17.2188, "mean_token_accuracy": 0.5414544939994812, "num_tokens": 1046132.0, "step": 16 }, { "epoch": 0.06882591093117409, "grad_norm": 12.756822173097444, "learning_rate": 1.9352226720647776e-05, "loss": 15.7188, "mean_token_accuracy": 0.5702171325683594, "num_tokens": 1111408.0, "step": 17 }, { "epoch": 0.0728744939271255, "grad_norm": 10.541413178483012, "learning_rate": 1.931174089068826e-05, "loss": 16.6406, "mean_token_accuracy": 0.5545904040336609, "num_tokens": 1181008.0, "step": 18 }, { "epoch": 0.07692307692307693, "grad_norm": 9.058216089424022, "learning_rate": 1.9271255060728745e-05, "loss": 15.4922, "mean_token_accuracy": 0.5672646164894104, "num_tokens": 1242895.0, "step": 19 }, { "epoch": 0.08097165991902834, "grad_norm": 8.224696140023758, "learning_rate": 1.923076923076923e-05, "loss": 14.5, "mean_token_accuracy": 0.5876371264457703, "num_tokens": 1321339.0, "step": 20 }, { "epoch": 0.08097165991902834, "eval_loss": 1.959375023841858, "eval_mean_token_accuracy": 0.5695992410182953, "eval_num_tokens": 1321339.0, "eval_runtime": 0.6175, "eval_samples_per_second": 323.911, "eval_steps_per_second": 6.478, "step": 20 }, { "epoch": 0.08502024291497975, "grad_norm": 9.194280544253612, "learning_rate": 1.9190283400809718e-05, "loss": 15.7969, "mean_token_accuracy": 0.5668182373046875, "num_tokens": 1388271.0, "step": 21 }, { "epoch": 0.08906882591093117, "grad_norm": 9.173673841849936, "learning_rate": 1.9149797570850204e-05, "loss": 16.1875, "mean_token_accuracy": 0.5593817234039307, "num_tokens": 1456462.0, "step": 22 }, { "epoch": 0.0931174089068826, "grad_norm": 9.571035474364693, "learning_rate": 1.910931174089069e-05, "loss": 15.8516, "mean_token_accuracy": 0.5684004426002502, "num_tokens": 1518799.0, "step": 23 }, { "epoch": 0.09716599190283401, "grad_norm": 7.975715179302099, "learning_rate": 1.9068825910931176e-05, "loss": 14.9844, "mean_token_accuracy": 0.5802298188209534, "num_tokens": 1588301.0, "step": 24 }, { "epoch": 0.10121457489878542, "grad_norm": 8.06780763084373, "learning_rate": 1.9028340080971662e-05, "loss": 15.9766, "mean_token_accuracy": 0.5611966848373413, "num_tokens": 1665044.0, "step": 25 }, { "epoch": 0.10121457489878542, "eval_loss": 1.9378124475479126, "eval_mean_token_accuracy": 0.571557804942131, "eval_num_tokens": 1665044.0, "eval_runtime": 0.6132, "eval_samples_per_second": 326.145, "eval_steps_per_second": 6.523, "step": 25 }, { "epoch": 0.10526315789473684, "grad_norm": 9.003512171360999, "learning_rate": 1.8987854251012148e-05, "loss": 15.9062, "mean_token_accuracy": 0.5626139640808105, "num_tokens": 1731459.0, "step": 26 }, { "epoch": 0.10931174089068826, "grad_norm": 8.045437012882436, "learning_rate": 1.894736842105263e-05, "loss": 14.9844, "mean_token_accuracy": 0.5767053365707397, "num_tokens": 1794413.0, "step": 27 }, { "epoch": 0.11336032388663968, "grad_norm": 7.791818905787103, "learning_rate": 1.8906882591093117e-05, "loss": 16.0703, "mean_token_accuracy": 0.5646764636039734, "num_tokens": 1864000.0, "step": 28 }, { "epoch": 0.11740890688259109, "grad_norm": 7.899247736502825, "learning_rate": 1.8866396761133607e-05, "loss": 14.6562, "mean_token_accuracy": 0.5848944187164307, "num_tokens": 1926306.0, "step": 29 }, { "epoch": 0.1214574898785425, "grad_norm": 9.046927442368185, "learning_rate": 1.882591093117409e-05, "loss": 15.0781, "mean_token_accuracy": 0.5783066749572754, "num_tokens": 2001619.0, "step": 30 }, { "epoch": 0.1214574898785425, "eval_loss": 1.9221874475479126, "eval_mean_token_accuracy": 0.5738949328660965, "eval_num_tokens": 2001619.0, "eval_runtime": 0.6128, "eval_samples_per_second": 326.396, "eval_steps_per_second": 6.528, "step": 30 }, { "epoch": 0.12550607287449392, "grad_norm": 8.961465962055085, "learning_rate": 1.8785425101214576e-05, "loss": 16.1797, "mean_token_accuracy": 0.5587651133537292, "num_tokens": 2066109.0, "step": 31 }, { "epoch": 0.12955465587044535, "grad_norm": 7.562379064010745, "learning_rate": 1.874493927125506e-05, "loss": 15.4922, "mean_token_accuracy": 0.5650998950004578, "num_tokens": 2142786.0, "step": 32 }, { "epoch": 0.13360323886639677, "grad_norm": 8.562712496801407, "learning_rate": 1.8704453441295548e-05, "loss": 14.9219, "mean_token_accuracy": 0.5784920454025269, "num_tokens": 2199722.0, "step": 33 }, { "epoch": 0.13765182186234817, "grad_norm": 9.988663916747383, "learning_rate": 1.8663967611336034e-05, "loss": 17.1641, "mean_token_accuracy": 0.5491723418235779, "num_tokens": 2275906.0, "step": 34 }, { "epoch": 0.1417004048582996, "grad_norm": 7.709526019356417, "learning_rate": 1.862348178137652e-05, "loss": 15.7656, "mean_token_accuracy": 0.5621457099914551, "num_tokens": 2358888.0, "step": 35 }, { "epoch": 0.1417004048582996, "eval_loss": 1.9071874618530273, "eval_mean_token_accuracy": 0.5757417529821396, "eval_num_tokens": 2358888.0, "eval_runtime": 0.6136, "eval_samples_per_second": 325.959, "eval_steps_per_second": 6.519, "step": 35 }, { "epoch": 0.145748987854251, "grad_norm": 8.03798824892872, "learning_rate": 1.8582995951417006e-05, "loss": 15.8047, "mean_token_accuracy": 0.5641842484474182, "num_tokens": 2428938.0, "step": 36 }, { "epoch": 0.14979757085020243, "grad_norm": 8.32083756121802, "learning_rate": 1.8542510121457492e-05, "loss": 15.2578, "mean_token_accuracy": 0.5743291974067688, "num_tokens": 2493963.0, "step": 37 }, { "epoch": 0.15384615384615385, "grad_norm": 8.354287332937643, "learning_rate": 1.850202429149798e-05, "loss": 15.4141, "mean_token_accuracy": 0.5688024759292603, "num_tokens": 2559904.0, "step": 38 }, { "epoch": 0.15789473684210525, "grad_norm": 8.579778574791483, "learning_rate": 1.8461538461538465e-05, "loss": 15.5078, "mean_token_accuracy": 0.5710554122924805, "num_tokens": 2614334.0, "step": 39 }, { "epoch": 0.16194331983805668, "grad_norm": 8.994514132460578, "learning_rate": 1.8421052631578947e-05, "loss": 14.1875, "mean_token_accuracy": 0.5983923673629761, "num_tokens": 2677721.0, "step": 40 }, { "epoch": 0.16194331983805668, "eval_loss": 1.894374966621399, "eval_mean_token_accuracy": 0.577251747250557, "eval_num_tokens": 2677721.0, "eval_runtime": 0.6381, "eval_samples_per_second": 313.434, "eval_steps_per_second": 6.269, "step": 40 }, { "epoch": 0.1659919028340081, "grad_norm": 9.563884911162102, "learning_rate": 1.8380566801619433e-05, "loss": 15.8906, "mean_token_accuracy": 0.5601885914802551, "num_tokens": 2736542.0, "step": 41 }, { "epoch": 0.1700404858299595, "grad_norm": 8.720473365423832, "learning_rate": 1.8340080971659923e-05, "loss": 14.7969, "mean_token_accuracy": 0.5805999636650085, "num_tokens": 2795142.0, "step": 42 }, { "epoch": 0.17408906882591094, "grad_norm": 7.269013839644517, "learning_rate": 1.8299595141700406e-05, "loss": 14.7031, "mean_token_accuracy": 0.578359067440033, "num_tokens": 2863022.0, "step": 43 }, { "epoch": 0.17813765182186234, "grad_norm": 7.272014286398225, "learning_rate": 1.8259109311740892e-05, "loss": 14.8516, "mean_token_accuracy": 0.5769524574279785, "num_tokens": 2931180.0, "step": 44 }, { "epoch": 0.18218623481781376, "grad_norm": 8.851418056520739, "learning_rate": 1.8218623481781378e-05, "loss": 15.3594, "mean_token_accuracy": 0.5745086669921875, "num_tokens": 2995713.0, "step": 45 }, { "epoch": 0.18218623481781376, "eval_loss": 1.885312557220459, "eval_mean_token_accuracy": 0.5781335979700089, "eval_num_tokens": 2995713.0, "eval_runtime": 0.6226, "eval_samples_per_second": 321.249, "eval_steps_per_second": 6.425, "step": 45 }, { "epoch": 0.1862348178137652, "grad_norm": 8.587589121567573, "learning_rate": 1.8178137651821864e-05, "loss": 14.9531, "mean_token_accuracy": 0.5738093852996826, "num_tokens": 3066642.0, "step": 46 }, { "epoch": 0.1902834008097166, "grad_norm": 7.890409770531286, "learning_rate": 1.813765182186235e-05, "loss": 15.1016, "mean_token_accuracy": 0.5790749192237854, "num_tokens": 3128603.0, "step": 47 }, { "epoch": 0.19433198380566802, "grad_norm": 7.793678892191199, "learning_rate": 1.8097165991902836e-05, "loss": 14.8281, "mean_token_accuracy": 0.5814535021781921, "num_tokens": 3187449.0, "step": 48 }, { "epoch": 0.19838056680161945, "grad_norm": 7.256161611883218, "learning_rate": 1.805668016194332e-05, "loss": 14.9062, "mean_token_accuracy": 0.5790348649024963, "num_tokens": 3262113.0, "step": 49 }, { "epoch": 0.20242914979757085, "grad_norm": 7.3121519125177254, "learning_rate": 1.801619433198381e-05, "loss": 14.6484, "mean_token_accuracy": 0.5866994261741638, "num_tokens": 3332956.0, "step": 50 }, { "epoch": 0.20242914979757085, "eval_loss": 1.8756250143051147, "eval_mean_token_accuracy": 0.5786750316619873, "eval_num_tokens": 3332956.0, "eval_runtime": 0.616, "eval_samples_per_second": 324.656, "eval_steps_per_second": 6.493, "step": 50 }, { "epoch": 0.20647773279352227, "grad_norm": 7.4168976628601095, "learning_rate": 1.7975708502024295e-05, "loss": 15.9141, "mean_token_accuracy": 0.5570866465568542, "num_tokens": 3402686.0, "step": 51 }, { "epoch": 0.21052631578947367, "grad_norm": 8.132782257191536, "learning_rate": 1.7935222672064778e-05, "loss": 13.6328, "mean_token_accuracy": 0.6026627421379089, "num_tokens": 3463439.0, "step": 52 }, { "epoch": 0.2145748987854251, "grad_norm": 8.722536735498021, "learning_rate": 1.7894736842105264e-05, "loss": 15.9922, "mean_token_accuracy": 0.5636070370674133, "num_tokens": 3534808.0, "step": 53 }, { "epoch": 0.21862348178137653, "grad_norm": 9.5925704794369, "learning_rate": 1.785425101214575e-05, "loss": 16.5859, "mean_token_accuracy": 0.5475066900253296, "num_tokens": 3598221.0, "step": 54 }, { "epoch": 0.22267206477732793, "grad_norm": 8.403059172595993, "learning_rate": 1.7813765182186236e-05, "loss": 15.0859, "mean_token_accuracy": 0.5766125321388245, "num_tokens": 3667399.0, "step": 55 }, { "epoch": 0.22267206477732793, "eval_loss": 1.8668750524520874, "eval_mean_token_accuracy": 0.5796706676483154, "eval_num_tokens": 3667399.0, "eval_runtime": 0.6158, "eval_samples_per_second": 324.774, "eval_steps_per_second": 6.495, "step": 55 }, { "epoch": 0.22672064777327935, "grad_norm": 7.854856510906883, "learning_rate": 1.7773279352226722e-05, "loss": 14.5781, "mean_token_accuracy": 0.5866025686264038, "num_tokens": 3741537.0, "step": 56 }, { "epoch": 0.23076923076923078, "grad_norm": 9.026696458622231, "learning_rate": 1.7732793522267208e-05, "loss": 15.4062, "mean_token_accuracy": 0.5732284784317017, "num_tokens": 3817002.0, "step": 57 }, { "epoch": 0.23481781376518218, "grad_norm": 8.576517147098428, "learning_rate": 1.7692307692307694e-05, "loss": 14.7969, "mean_token_accuracy": 0.577850878238678, "num_tokens": 3875145.0, "step": 58 }, { "epoch": 0.2388663967611336, "grad_norm": 9.62049525740993, "learning_rate": 1.765182186234818e-05, "loss": 14.9531, "mean_token_accuracy": 0.5829423069953918, "num_tokens": 3947880.0, "step": 59 }, { "epoch": 0.242914979757085, "grad_norm": 7.546747277273659, "learning_rate": 1.7611336032388667e-05, "loss": 13.5547, "mean_token_accuracy": 0.6001456379890442, "num_tokens": 4015226.0, "step": 60 }, { "epoch": 0.242914979757085, "eval_loss": 1.859375, "eval_mean_token_accuracy": 0.5801091939210892, "eval_num_tokens": 4015226.0, "eval_runtime": 0.6356, "eval_samples_per_second": 314.646, "eval_steps_per_second": 6.293, "step": 60 }, { "epoch": 0.24696356275303644, "grad_norm": 8.432094147883522, "learning_rate": 1.757085020242915e-05, "loss": 14.3438, "mean_token_accuracy": 0.5897209048271179, "num_tokens": 4076385.0, "step": 61 }, { "epoch": 0.25101214574898784, "grad_norm": 8.727387350173785, "learning_rate": 1.7530364372469636e-05, "loss": 14.0859, "mean_token_accuracy": 0.5868391394615173, "num_tokens": 4130336.0, "step": 62 }, { "epoch": 0.2550607287449393, "grad_norm": 7.853461601699973, "learning_rate": 1.7489878542510125e-05, "loss": 14.5625, "mean_token_accuracy": 0.5877260565757751, "num_tokens": 4205714.0, "step": 63 }, { "epoch": 0.2591093117408907, "grad_norm": 10.618114952893986, "learning_rate": 1.7449392712550608e-05, "loss": 14.7891, "mean_token_accuracy": 0.5806258916854858, "num_tokens": 4276016.0, "step": 64 }, { "epoch": 0.2631578947368421, "grad_norm": 8.368519887093084, "learning_rate": 1.7408906882591094e-05, "loss": 14.6094, "mean_token_accuracy": 0.5852103233337402, "num_tokens": 4341166.0, "step": 65 }, { "epoch": 0.2631578947368421, "eval_loss": 1.8528125286102295, "eval_mean_token_accuracy": 0.5809510350227356, "eval_num_tokens": 4341166.0, "eval_runtime": 0.6479, "eval_samples_per_second": 308.698, "eval_steps_per_second": 6.174, "step": 65 }, { "epoch": 0.26720647773279355, "grad_norm": 8.961964950224539, "learning_rate": 1.736842105263158e-05, "loss": 13.7656, "mean_token_accuracy": 0.5963848829269409, "num_tokens": 4394174.0, "step": 66 }, { "epoch": 0.27125506072874495, "grad_norm": 8.15926394629814, "learning_rate": 1.7327935222672066e-05, "loss": 14.8906, "mean_token_accuracy": 0.5760456323623657, "num_tokens": 4463144.0, "step": 67 }, { "epoch": 0.27530364372469635, "grad_norm": 7.093290877100628, "learning_rate": 1.7287449392712552e-05, "loss": 14.375, "mean_token_accuracy": 0.5898444056510925, "num_tokens": 4533140.0, "step": 68 }, { "epoch": 0.2793522267206478, "grad_norm": 8.262028322497853, "learning_rate": 1.724696356275304e-05, "loss": 15.3672, "mean_token_accuracy": 0.5701308250427246, "num_tokens": 4596279.0, "step": 69 }, { "epoch": 0.2834008097165992, "grad_norm": 7.614011800242887, "learning_rate": 1.720647773279352e-05, "loss": 13.5547, "mean_token_accuracy": 0.5994280576705933, "num_tokens": 4661033.0, "step": 70 }, { "epoch": 0.2834008097165992, "eval_loss": 1.8446874618530273, "eval_mean_token_accuracy": 0.5816306471824646, "eval_num_tokens": 4661033.0, "eval_runtime": 0.6219, "eval_samples_per_second": 321.57, "eval_steps_per_second": 6.431, "step": 70 }, { "epoch": 0.2874493927125506, "grad_norm": 8.501649392228297, "learning_rate": 1.716599190283401e-05, "loss": 14.5312, "mean_token_accuracy": 0.5782850384712219, "num_tokens": 4730708.0, "step": 71 }, { "epoch": 0.291497975708502, "grad_norm": 7.022173928165628, "learning_rate": 1.7125506072874497e-05, "loss": 14.8203, "mean_token_accuracy": 0.5837699770927429, "num_tokens": 4804253.0, "step": 72 }, { "epoch": 0.29554655870445345, "grad_norm": 7.489428149228862, "learning_rate": 1.708502024291498e-05, "loss": 13.9219, "mean_token_accuracy": 0.594904363155365, "num_tokens": 4873160.0, "step": 73 }, { "epoch": 0.29959514170040485, "grad_norm": 8.856719704451924, "learning_rate": 1.7044534412955466e-05, "loss": 15.5781, "mean_token_accuracy": 0.5685590505599976, "num_tokens": 4942325.0, "step": 74 }, { "epoch": 0.30364372469635625, "grad_norm": 7.197286431718064, "learning_rate": 1.7004048582995952e-05, "loss": 14.4922, "mean_token_accuracy": 0.5876879096031189, "num_tokens": 5014497.0, "step": 75 }, { "epoch": 0.30364372469635625, "eval_loss": 1.8378125429153442, "eval_mean_token_accuracy": 0.5824010521173477, "eval_num_tokens": 5014497.0, "eval_runtime": 0.6338, "eval_samples_per_second": 315.564, "eval_steps_per_second": 6.311, "step": 75 }, { "epoch": 0.3076923076923077, "grad_norm": 9.04302504332728, "learning_rate": 1.6963562753036438e-05, "loss": 15.4844, "mean_token_accuracy": 0.571445107460022, "num_tokens": 5075118.0, "step": 76 }, { "epoch": 0.3117408906882591, "grad_norm": 7.558608724226377, "learning_rate": 1.6923076923076924e-05, "loss": 14.5156, "mean_token_accuracy": 0.5831646919250488, "num_tokens": 5142386.0, "step": 77 }, { "epoch": 0.3157894736842105, "grad_norm": 7.8941192536661955, "learning_rate": 1.688259109311741e-05, "loss": 13.6406, "mean_token_accuracy": 0.5994213223457336, "num_tokens": 5208462.0, "step": 78 }, { "epoch": 0.31983805668016196, "grad_norm": 7.560860397902134, "learning_rate": 1.6842105263157896e-05, "loss": 15.3047, "mean_token_accuracy": 0.5730275511741638, "num_tokens": 5285004.0, "step": 79 }, { "epoch": 0.32388663967611336, "grad_norm": 7.257123559107278, "learning_rate": 1.6801619433198383e-05, "loss": 14.5312, "mean_token_accuracy": 0.586321234703064, "num_tokens": 5349606.0, "step": 80 }, { "epoch": 0.32388663967611336, "eval_loss": 1.8290624618530273, "eval_mean_token_accuracy": 0.583142414689064, "eval_num_tokens": 5349606.0, "eval_runtime": 0.6206, "eval_samples_per_second": 322.278, "eval_steps_per_second": 6.446, "step": 80 }, { "epoch": 0.32793522267206476, "grad_norm": 8.385990372203198, "learning_rate": 1.676113360323887e-05, "loss": 15.8828, "mean_token_accuracy": 0.566261887550354, "num_tokens": 5413817.0, "step": 81 }, { "epoch": 0.3319838056680162, "grad_norm": 8.40823722369304, "learning_rate": 1.672064777327935e-05, "loss": 15.5781, "mean_token_accuracy": 0.5658481121063232, "num_tokens": 5487505.0, "step": 82 }, { "epoch": 0.3360323886639676, "grad_norm": 8.204736370997345, "learning_rate": 1.6680161943319838e-05, "loss": 13.2969, "mean_token_accuracy": 0.6081260442733765, "num_tokens": 5549321.0, "step": 83 }, { "epoch": 0.340080971659919, "grad_norm": 7.713082914361621, "learning_rate": 1.6639676113360327e-05, "loss": 14.375, "mean_token_accuracy": 0.5855867266654968, "num_tokens": 5612660.0, "step": 84 }, { "epoch": 0.3441295546558704, "grad_norm": 9.49322308557619, "learning_rate": 1.659919028340081e-05, "loss": 15.6953, "mean_token_accuracy": 0.5663869380950928, "num_tokens": 5685773.0, "step": 85 }, { "epoch": 0.3441295546558704, "eval_loss": 1.8246874809265137, "eval_mean_token_accuracy": 0.5833836942911148, "eval_num_tokens": 5685773.0, "eval_runtime": 0.6299, "eval_samples_per_second": 317.535, "eval_steps_per_second": 6.351, "step": 85 }, { "epoch": 0.3481781376518219, "grad_norm": 8.979306474955873, "learning_rate": 1.6558704453441296e-05, "loss": 14.8125, "mean_token_accuracy": 0.5798248648643494, "num_tokens": 5746238.0, "step": 86 }, { "epoch": 0.3522267206477733, "grad_norm": 7.582176162306731, "learning_rate": 1.6518218623481782e-05, "loss": 15.5625, "mean_token_accuracy": 0.5640476942062378, "num_tokens": 5814993.0, "step": 87 }, { "epoch": 0.3562753036437247, "grad_norm": 8.163709779919543, "learning_rate": 1.6477732793522268e-05, "loss": 15.125, "mean_token_accuracy": 0.5744800567626953, "num_tokens": 5884344.0, "step": 88 }, { "epoch": 0.3603238866396761, "grad_norm": 7.597205625950287, "learning_rate": 1.6437246963562754e-05, "loss": 15.1172, "mean_token_accuracy": 0.5687373876571655, "num_tokens": 5954319.0, "step": 89 }, { "epoch": 0.3643724696356275, "grad_norm": 7.2482847160987625, "learning_rate": 1.639676113360324e-05, "loss": 13.2422, "mean_token_accuracy": 0.6023260354995728, "num_tokens": 6019645.0, "step": 90 }, { "epoch": 0.3643724696356275, "eval_loss": 1.8174999952316284, "eval_mean_token_accuracy": 0.5845800191164017, "eval_num_tokens": 6019645.0, "eval_runtime": 0.6235, "eval_samples_per_second": 320.756, "eval_steps_per_second": 6.415, "step": 90 }, { "epoch": 0.3684210526315789, "grad_norm": 8.841871793018052, "learning_rate": 1.6356275303643723e-05, "loss": 13.7266, "mean_token_accuracy": 0.6017288565635681, "num_tokens": 6075352.0, "step": 91 }, { "epoch": 0.3724696356275304, "grad_norm": 8.434804871321719, "learning_rate": 1.6315789473684213e-05, "loss": 14.0547, "mean_token_accuracy": 0.5935065746307373, "num_tokens": 6145764.0, "step": 92 }, { "epoch": 0.3765182186234818, "grad_norm": 7.730347182665673, "learning_rate": 1.62753036437247e-05, "loss": 14.6641, "mean_token_accuracy": 0.5870702862739563, "num_tokens": 6210408.0, "step": 93 }, { "epoch": 0.3805668016194332, "grad_norm": 7.45273814710918, "learning_rate": 1.6234817813765185e-05, "loss": 14.2422, "mean_token_accuracy": 0.5905075669288635, "num_tokens": 6279074.0, "step": 94 }, { "epoch": 0.38461538461538464, "grad_norm": 11.1983859182283, "learning_rate": 1.6194331983805668e-05, "loss": 15.3906, "mean_token_accuracy": 0.5667217373847961, "num_tokens": 6348733.0, "step": 95 }, { "epoch": 0.38461538461538464, "eval_loss": 1.8106249570846558, "eval_mean_token_accuracy": 0.5847904682159424, "eval_num_tokens": 6348733.0, "eval_runtime": 0.6596, "eval_samples_per_second": 303.232, "eval_steps_per_second": 6.065, "step": 95 }, { "epoch": 0.38866396761133604, "grad_norm": 6.824860263185756, "learning_rate": 1.6153846153846154e-05, "loss": 13.8672, "mean_token_accuracy": 0.5981316566467285, "num_tokens": 6418698.0, "step": 96 }, { "epoch": 0.39271255060728744, "grad_norm": 7.431961140378814, "learning_rate": 1.6113360323886644e-05, "loss": 15.1719, "mean_token_accuracy": 0.5711833834648132, "num_tokens": 6488097.0, "step": 97 }, { "epoch": 0.3967611336032389, "grad_norm": 8.677956082707139, "learning_rate": 1.6072874493927126e-05, "loss": 14.4922, "mean_token_accuracy": 0.585637092590332, "num_tokens": 6549764.0, "step": 98 }, { "epoch": 0.4008097165991903, "grad_norm": 8.646021653592204, "learning_rate": 1.6032388663967612e-05, "loss": 14.75, "mean_token_accuracy": 0.5855600833892822, "num_tokens": 6609482.0, "step": 99 }, { "epoch": 0.4048582995951417, "grad_norm": 7.907811817137269, "learning_rate": 1.59919028340081e-05, "loss": 15.0469, "mean_token_accuracy": 0.5750304460525513, "num_tokens": 6679311.0, "step": 100 }, { "epoch": 0.4048582995951417, "eval_loss": 1.807187557220459, "eval_mean_token_accuracy": 0.585128903388977, "eval_num_tokens": 6679311.0, "eval_runtime": 0.6262, "eval_samples_per_second": 319.397, "eval_steps_per_second": 6.388, "step": 100 }, { "epoch": 0.4089068825910931, "grad_norm": 7.835498999161377, "learning_rate": 1.5951417004048585e-05, "loss": 15.0156, "mean_token_accuracy": 0.5781442523002625, "num_tokens": 6736916.0, "step": 101 }, { "epoch": 0.41295546558704455, "grad_norm": 7.972192619348582, "learning_rate": 1.591093117408907e-05, "loss": 14.3438, "mean_token_accuracy": 0.590062141418457, "num_tokens": 6802185.0, "step": 102 }, { "epoch": 0.41700404858299595, "grad_norm": 7.983491081447614, "learning_rate": 1.5870445344129557e-05, "loss": 14.75, "mean_token_accuracy": 0.579732358455658, "num_tokens": 6863598.0, "step": 103 }, { "epoch": 0.42105263157894735, "grad_norm": 10.01408214824878, "learning_rate": 1.582995951417004e-05, "loss": 14.9219, "mean_token_accuracy": 0.5742595791816711, "num_tokens": 6927748.0, "step": 104 }, { "epoch": 0.4251012145748988, "grad_norm": 7.779179257118591, "learning_rate": 1.578947368421053e-05, "loss": 13.8125, "mean_token_accuracy": 0.5934587121009827, "num_tokens": 6993579.0, "step": 105 }, { "epoch": 0.4251012145748988, "eval_loss": 1.8018749952316284, "eval_mean_token_accuracy": 0.5855304300785065, "eval_num_tokens": 6993579.0, "eval_runtime": 0.624, "eval_samples_per_second": 320.53, "eval_steps_per_second": 6.411, "step": 105 }, { "epoch": 0.4291497975708502, "grad_norm": 7.766216017833367, "learning_rate": 1.5748987854251015e-05, "loss": 14.9453, "mean_token_accuracy": 0.5719259977340698, "num_tokens": 7064007.0, "step": 106 }, { "epoch": 0.4331983805668016, "grad_norm": 8.87590967566746, "learning_rate": 1.5708502024291498e-05, "loss": 14.1328, "mean_token_accuracy": 0.5912887454032898, "num_tokens": 7126932.0, "step": 107 }, { "epoch": 0.43724696356275305, "grad_norm": 7.2824506866551015, "learning_rate": 1.5668016194331984e-05, "loss": 12.9688, "mean_token_accuracy": 0.6094680428504944, "num_tokens": 7188297.0, "step": 108 }, { "epoch": 0.44129554655870445, "grad_norm": 8.281692416649847, "learning_rate": 1.562753036437247e-05, "loss": 14.7578, "mean_token_accuracy": 0.5806804895401001, "num_tokens": 7252286.0, "step": 109 }, { "epoch": 0.44534412955465585, "grad_norm": 7.710589034957108, "learning_rate": 1.5587044534412957e-05, "loss": 13.9375, "mean_token_accuracy": 0.5891672968864441, "num_tokens": 7328970.0, "step": 110 }, { "epoch": 0.44534412955465585, "eval_loss": 1.7962499856948853, "eval_mean_token_accuracy": 0.5861413329839706, "eval_num_tokens": 7328970.0, "eval_runtime": 0.6541, "eval_samples_per_second": 305.776, "eval_steps_per_second": 6.116, "step": 110 }, { "epoch": 0.4493927125506073, "grad_norm": 9.932234036882006, "learning_rate": 1.5546558704453443e-05, "loss": 14.9844, "mean_token_accuracy": 0.583306074142456, "num_tokens": 7396220.0, "step": 111 }, { "epoch": 0.4534412955465587, "grad_norm": 7.818592875809941, "learning_rate": 1.550607287449393e-05, "loss": 14.6562, "mean_token_accuracy": 0.5830007195472717, "num_tokens": 7454663.0, "step": 112 }, { "epoch": 0.4574898785425101, "grad_norm": 8.077045865068051, "learning_rate": 1.5465587044534415e-05, "loss": 15.1641, "mean_token_accuracy": 0.5737788677215576, "num_tokens": 7515205.0, "step": 113 }, { "epoch": 0.46153846153846156, "grad_norm": 8.172258229119766, "learning_rate": 1.54251012145749e-05, "loss": 13.7969, "mean_token_accuracy": 0.5884114503860474, "num_tokens": 7571704.0, "step": 114 }, { "epoch": 0.46558704453441296, "grad_norm": 7.57672126802984, "learning_rate": 1.5384615384615387e-05, "loss": 13.6719, "mean_token_accuracy": 0.599338173866272, "num_tokens": 7634020.0, "step": 115 }, { "epoch": 0.46558704453441296, "eval_loss": 1.7934374809265137, "eval_mean_token_accuracy": 0.5860239416360855, "eval_num_tokens": 7634020.0, "eval_runtime": 0.6265, "eval_samples_per_second": 319.23, "eval_steps_per_second": 6.385, "step": 115 }, { "epoch": 0.46963562753036436, "grad_norm": 7.229443663353096, "learning_rate": 1.534412955465587e-05, "loss": 13.4922, "mean_token_accuracy": 0.5976777672767639, "num_tokens": 7702897.0, "step": 116 }, { "epoch": 0.47368421052631576, "grad_norm": 8.594596623758976, "learning_rate": 1.5303643724696356e-05, "loss": 13.7969, "mean_token_accuracy": 0.595205545425415, "num_tokens": 7761445.0, "step": 117 }, { "epoch": 0.4777327935222672, "grad_norm": 7.44049680844727, "learning_rate": 1.5263157894736846e-05, "loss": 14.4688, "mean_token_accuracy": 0.5831844210624695, "num_tokens": 7829220.0, "step": 118 }, { "epoch": 0.4817813765182186, "grad_norm": 8.010264997343702, "learning_rate": 1.5222672064777328e-05, "loss": 14.3516, "mean_token_accuracy": 0.5925936698913574, "num_tokens": 7897361.0, "step": 119 }, { "epoch": 0.48582995951417, "grad_norm": 9.11202877002977, "learning_rate": 1.5182186234817814e-05, "loss": 15.5078, "mean_token_accuracy": 0.5689979195594788, "num_tokens": 7958666.0, "step": 120 }, { "epoch": 0.48582995951417, "eval_loss": 1.786562442779541, "eval_mean_token_accuracy": 0.587049126625061, "eval_num_tokens": 7958666.0, "eval_runtime": 0.6451, "eval_samples_per_second": 310.009, "eval_steps_per_second": 6.2, "step": 120 }, { "epoch": 0.4898785425101215, "grad_norm": 8.733802509491014, "learning_rate": 1.5141700404858302e-05, "loss": 13.9922, "mean_token_accuracy": 0.591108500957489, "num_tokens": 8031901.0, "step": 121 }, { "epoch": 0.4939271255060729, "grad_norm": 7.43499030469072, "learning_rate": 1.5101214574898787e-05, "loss": 13.6719, "mean_token_accuracy": 0.6017109155654907, "num_tokens": 8105143.0, "step": 122 }, { "epoch": 0.4979757085020243, "grad_norm": 7.117038860943061, "learning_rate": 1.5060728744939273e-05, "loss": 14.5234, "mean_token_accuracy": 0.580233633518219, "num_tokens": 8177876.0, "step": 123 }, { "epoch": 0.5020242914979757, "grad_norm": 7.746503576535467, "learning_rate": 1.5020242914979759e-05, "loss": 14.6797, "mean_token_accuracy": 0.5798138380050659, "num_tokens": 8246274.0, "step": 124 }, { "epoch": 0.5060728744939271, "grad_norm": 7.532018226813427, "learning_rate": 1.4979757085020243e-05, "loss": 14.1562, "mean_token_accuracy": 0.582686722278595, "num_tokens": 8315418.0, "step": 125 }, { "epoch": 0.5060728744939271, "eval_loss": 1.7821874618530273, "eval_mean_token_accuracy": 0.5875359177589417, "eval_num_tokens": 8315418.0, "eval_runtime": 0.6323, "eval_samples_per_second": 316.295, "eval_steps_per_second": 6.326, "step": 125 }, { "epoch": 0.5101214574898786, "grad_norm": 8.864364235981503, "learning_rate": 1.493927125506073e-05, "loss": 14.4688, "mean_token_accuracy": 0.5993614792823792, "num_tokens": 8372174.0, "step": 126 }, { "epoch": 0.5141700404858299, "grad_norm": 8.95321820232742, "learning_rate": 1.4898785425101216e-05, "loss": 14.3438, "mean_token_accuracy": 0.5901056528091431, "num_tokens": 8441146.0, "step": 127 }, { "epoch": 0.5182186234817814, "grad_norm": 7.263835476936067, "learning_rate": 1.48582995951417e-05, "loss": 13.9688, "mean_token_accuracy": 0.594972550868988, "num_tokens": 8511862.0, "step": 128 }, { "epoch": 0.5222672064777328, "grad_norm": 9.946116238413886, "learning_rate": 1.4817813765182188e-05, "loss": 14.6953, "mean_token_accuracy": 0.5761271119117737, "num_tokens": 8579819.0, "step": 129 }, { "epoch": 0.5263157894736842, "grad_norm": 7.617051573886364, "learning_rate": 1.4777327935222674e-05, "loss": 12.9141, "mean_token_accuracy": 0.6136053800582886, "num_tokens": 8648885.0, "step": 130 }, { "epoch": 0.5263157894736842, "eval_loss": 1.7781250476837158, "eval_mean_token_accuracy": 0.5874989777803421, "eval_num_tokens": 8648885.0, "eval_runtime": 0.6456, "eval_samples_per_second": 309.786, "eval_steps_per_second": 6.196, "step": 130 }, { "epoch": 0.5303643724696356, "grad_norm": 7.995063941880695, "learning_rate": 1.4736842105263159e-05, "loss": 14.2891, "mean_token_accuracy": 0.5915219187736511, "num_tokens": 8716889.0, "step": 131 }, { "epoch": 0.5344129554655871, "grad_norm": 7.9292860672813665, "learning_rate": 1.4696356275303645e-05, "loss": 13.5078, "mean_token_accuracy": 0.6026769280433655, "num_tokens": 8777395.0, "step": 132 }, { "epoch": 0.5384615384615384, "grad_norm": 7.586666671628873, "learning_rate": 1.465587044534413e-05, "loss": 14.8672, "mean_token_accuracy": 0.5739372372627258, "num_tokens": 8848242.0, "step": 133 }, { "epoch": 0.5425101214574899, "grad_norm": 8.426747584830805, "learning_rate": 1.4615384615384615e-05, "loss": 14.6484, "mean_token_accuracy": 0.5826023817062378, "num_tokens": 8910211.0, "step": 134 }, { "epoch": 0.5465587044534413, "grad_norm": 8.34819038901423, "learning_rate": 1.4574898785425103e-05, "loss": 15.3672, "mean_token_accuracy": 0.5670615434646606, "num_tokens": 8976714.0, "step": 135 }, { "epoch": 0.5465587044534413, "eval_loss": 1.770937442779541, "eval_mean_token_accuracy": 0.5890246480703354, "eval_num_tokens": 8976714.0, "eval_runtime": 0.627, "eval_samples_per_second": 318.954, "eval_steps_per_second": 6.379, "step": 135 }, { "epoch": 0.5506072874493927, "grad_norm": 7.635072916228879, "learning_rate": 1.453441295546559e-05, "loss": 13.5703, "mean_token_accuracy": 0.5985166430473328, "num_tokens": 9032733.0, "step": 136 }, { "epoch": 0.5546558704453441, "grad_norm": 7.13210528504054, "learning_rate": 1.4493927125506074e-05, "loss": 14.0469, "mean_token_accuracy": 0.5941102504730225, "num_tokens": 9106960.0, "step": 137 }, { "epoch": 0.5587044534412956, "grad_norm": 7.472585203507861, "learning_rate": 1.445344129554656e-05, "loss": 13.4141, "mean_token_accuracy": 0.6021690964698792, "num_tokens": 9171843.0, "step": 138 }, { "epoch": 0.562753036437247, "grad_norm": 8.097767778322229, "learning_rate": 1.4412955465587046e-05, "loss": 14.8047, "mean_token_accuracy": 0.578671932220459, "num_tokens": 9238411.0, "step": 139 }, { "epoch": 0.5668016194331984, "grad_norm": 8.339563519464773, "learning_rate": 1.437246963562753e-05, "loss": 14.6484, "mean_token_accuracy": 0.5821104645729065, "num_tokens": 9300623.0, "step": 140 }, { "epoch": 0.5668016194331984, "eval_loss": 1.7675000429153442, "eval_mean_token_accuracy": 0.5898215025663376, "eval_num_tokens": 9300623.0, "eval_runtime": 0.6311, "eval_samples_per_second": 316.883, "eval_steps_per_second": 6.338, "step": 140 }, { "epoch": 0.5708502024291497, "grad_norm": 7.211581007077111, "learning_rate": 1.4331983805668017e-05, "loss": 14.1094, "mean_token_accuracy": 0.5875740051269531, "num_tokens": 9364199.0, "step": 141 }, { "epoch": 0.5748987854251012, "grad_norm": 8.547180559454508, "learning_rate": 1.4291497975708504e-05, "loss": 13.3594, "mean_token_accuracy": 0.6009641885757446, "num_tokens": 9427112.0, "step": 142 }, { "epoch": 0.5789473684210527, "grad_norm": 8.314592250089833, "learning_rate": 1.4251012145748989e-05, "loss": 13.6719, "mean_token_accuracy": 0.5959135890007019, "num_tokens": 9493444.0, "step": 143 }, { "epoch": 0.582995951417004, "grad_norm": 8.353437279154521, "learning_rate": 1.4210526315789475e-05, "loss": 14.4453, "mean_token_accuracy": 0.5817537307739258, "num_tokens": 9563780.0, "step": 144 }, { "epoch": 0.5870445344129555, "grad_norm": 7.105730625315736, "learning_rate": 1.4170040485829961e-05, "loss": 13.8828, "mean_token_accuracy": 0.5948812961578369, "num_tokens": 9634643.0, "step": 145 }, { "epoch": 0.5870445344129555, "eval_loss": 1.764062523841858, "eval_mean_token_accuracy": 0.589348778128624, "eval_num_tokens": 9634643.0, "eval_runtime": 0.6326, "eval_samples_per_second": 316.156, "eval_steps_per_second": 6.323, "step": 145 }, { "epoch": 0.5910931174089069, "grad_norm": 8.85529190632474, "learning_rate": 1.4129554655870446e-05, "loss": 14.9609, "mean_token_accuracy": 0.5746488571166992, "num_tokens": 9705552.0, "step": 146 }, { "epoch": 0.5951417004048583, "grad_norm": 7.404634749249628, "learning_rate": 1.4089068825910932e-05, "loss": 14.1562, "mean_token_accuracy": 0.5874021053314209, "num_tokens": 9784121.0, "step": 147 }, { "epoch": 0.5991902834008097, "grad_norm": 8.414339407163222, "learning_rate": 1.4048582995951418e-05, "loss": 14.2266, "mean_token_accuracy": 0.5838397741317749, "num_tokens": 9855434.0, "step": 148 }, { "epoch": 0.6032388663967612, "grad_norm": 8.385715636253362, "learning_rate": 1.4008097165991902e-05, "loss": 14.9531, "mean_token_accuracy": 0.580719530582428, "num_tokens": 9932270.0, "step": 149 }, { "epoch": 0.6072874493927125, "grad_norm": 8.437220430570475, "learning_rate": 1.396761133603239e-05, "loss": 13.5312, "mean_token_accuracy": 0.6000396013259888, "num_tokens": 9987882.0, "step": 150 }, { "epoch": 0.6072874493927125, "eval_loss": 1.7596875429153442, "eval_mean_token_accuracy": 0.592434361577034, "eval_num_tokens": 9987882.0, "eval_runtime": 0.6411, "eval_samples_per_second": 311.96, "eval_steps_per_second": 6.239, "step": 150 }, { "epoch": 0.611336032388664, "grad_norm": 7.3392719089419, "learning_rate": 1.3927125506072876e-05, "loss": 14.1953, "mean_token_accuracy": 0.5917255878448486, "num_tokens": 10053860.0, "step": 151 }, { "epoch": 0.6153846153846154, "grad_norm": 7.106128585719554, "learning_rate": 1.388663967611336e-05, "loss": 13.9062, "mean_token_accuracy": 0.5945723652839661, "num_tokens": 10118446.0, "step": 152 }, { "epoch": 0.6194331983805668, "grad_norm": 8.771091973478418, "learning_rate": 1.3846153846153847e-05, "loss": 13.6953, "mean_token_accuracy": 0.5965032577514648, "num_tokens": 10185658.0, "step": 153 }, { "epoch": 0.6234817813765182, "grad_norm": 7.463408517295952, "learning_rate": 1.3805668016194333e-05, "loss": 14.5234, "mean_token_accuracy": 0.5856794118881226, "num_tokens": 10256194.0, "step": 154 }, { "epoch": 0.6275303643724697, "grad_norm": 7.766925275194455, "learning_rate": 1.3765182186234817e-05, "loss": 14.4844, "mean_token_accuracy": 0.5821614861488342, "num_tokens": 10322676.0, "step": 155 }, { "epoch": 0.6275303643724697, "eval_loss": 1.7568750381469727, "eval_mean_token_accuracy": 0.593303382396698, "eval_num_tokens": 10322676.0, "eval_runtime": 0.6285, "eval_samples_per_second": 318.24, "eval_steps_per_second": 6.365, "step": 155 }, { "epoch": 0.631578947368421, "grad_norm": 11.730558732427344, "learning_rate": 1.3724696356275305e-05, "loss": 14.1797, "mean_token_accuracy": 0.590240478515625, "num_tokens": 10390182.0, "step": 156 }, { "epoch": 0.6356275303643725, "grad_norm": 6.962088111992022, "learning_rate": 1.3684210526315791e-05, "loss": 14.6328, "mean_token_accuracy": 0.5875160694122314, "num_tokens": 10482669.0, "step": 157 }, { "epoch": 0.6396761133603239, "grad_norm": 7.342884429279778, "learning_rate": 1.3643724696356277e-05, "loss": 13.4531, "mean_token_accuracy": 0.6036462783813477, "num_tokens": 10543343.0, "step": 158 }, { "epoch": 0.6437246963562753, "grad_norm": 9.243698402724284, "learning_rate": 1.3603238866396762e-05, "loss": 14.3438, "mean_token_accuracy": 0.5891345739364624, "num_tokens": 10605678.0, "step": 159 }, { "epoch": 0.6477732793522267, "grad_norm": 6.586652944449273, "learning_rate": 1.3562753036437248e-05, "loss": 13.6094, "mean_token_accuracy": 0.6069329977035522, "num_tokens": 10678928.0, "step": 160 }, { "epoch": 0.6477732793522267, "eval_loss": 1.751562476158142, "eval_mean_token_accuracy": 0.5941518694162369, "eval_num_tokens": 10678928.0, "eval_runtime": 0.6304, "eval_samples_per_second": 317.246, "eval_steps_per_second": 6.345, "step": 160 }, { "epoch": 0.6518218623481782, "grad_norm": 7.292010909500028, "learning_rate": 1.3522267206477734e-05, "loss": 15.5391, "mean_token_accuracy": 0.5630365014076233, "num_tokens": 10752497.0, "step": 161 }, { "epoch": 0.6558704453441295, "grad_norm": 8.517901596352386, "learning_rate": 1.3481781376518219e-05, "loss": 13.8203, "mean_token_accuracy": 0.606192409992218, "num_tokens": 10808888.0, "step": 162 }, { "epoch": 0.659919028340081, "grad_norm": 7.047539986657361, "learning_rate": 1.3441295546558706e-05, "loss": 12.6094, "mean_token_accuracy": 0.6234721541404724, "num_tokens": 10867941.0, "step": 163 }, { "epoch": 0.6639676113360324, "grad_norm": 8.193594708311382, "learning_rate": 1.3400809716599193e-05, "loss": 15.1562, "mean_token_accuracy": 0.5752149224281311, "num_tokens": 10945011.0, "step": 164 }, { "epoch": 0.6680161943319838, "grad_norm": 7.0437604094189386, "learning_rate": 1.3360323886639677e-05, "loss": 13.1094, "mean_token_accuracy": 0.6154734492301941, "num_tokens": 11020417.0, "step": 165 }, { "epoch": 0.6680161943319838, "eval_loss": 1.7496875524520874, "eval_mean_token_accuracy": 0.5943519324064255, "eval_num_tokens": 11020417.0, "eval_runtime": 0.6499, "eval_samples_per_second": 307.758, "eval_steps_per_second": 6.155, "step": 165 }, { "epoch": 0.6720647773279352, "grad_norm": 7.138901056236762, "learning_rate": 1.3319838056680163e-05, "loss": 13.8594, "mean_token_accuracy": 0.5956168174743652, "num_tokens": 11089883.0, "step": 166 }, { "epoch": 0.6761133603238867, "grad_norm": 8.201709305900888, "learning_rate": 1.327935222672065e-05, "loss": 13.7344, "mean_token_accuracy": 0.5976811051368713, "num_tokens": 11147819.0, "step": 167 }, { "epoch": 0.680161943319838, "grad_norm": 7.8526020934287954, "learning_rate": 1.3238866396761134e-05, "loss": 13.9766, "mean_token_accuracy": 0.5927841067314148, "num_tokens": 11219059.0, "step": 168 }, { "epoch": 0.6842105263157895, "grad_norm": 7.318409427810135, "learning_rate": 1.3198380566801622e-05, "loss": 14.3281, "mean_token_accuracy": 0.5857775211334229, "num_tokens": 11278072.0, "step": 169 }, { "epoch": 0.6882591093117408, "grad_norm": 7.389155589413203, "learning_rate": 1.3157894736842108e-05, "loss": 14.4297, "mean_token_accuracy": 0.587368905544281, "num_tokens": 11351732.0, "step": 170 }, { "epoch": 0.6882591093117408, "eval_loss": 1.746250033378601, "eval_mean_token_accuracy": 0.5935623943805695, "eval_num_tokens": 11351732.0, "eval_runtime": 0.6314, "eval_samples_per_second": 316.758, "eval_steps_per_second": 6.335, "step": 170 }, { "epoch": 0.6923076923076923, "grad_norm": 8.124816197332432, "learning_rate": 1.3117408906882592e-05, "loss": 14.1172, "mean_token_accuracy": 0.5922242999076843, "num_tokens": 11407714.0, "step": 171 }, { "epoch": 0.6963562753036437, "grad_norm": 7.965720782927913, "learning_rate": 1.3076923076923078e-05, "loss": 13.4609, "mean_token_accuracy": 0.6015889048576355, "num_tokens": 11484184.0, "step": 172 }, { "epoch": 0.7004048582995951, "grad_norm": 8.915221369950393, "learning_rate": 1.3036437246963564e-05, "loss": 14.1875, "mean_token_accuracy": 0.5905665755271912, "num_tokens": 11544099.0, "step": 173 }, { "epoch": 0.7044534412955465, "grad_norm": 8.790247334783299, "learning_rate": 1.2995951417004049e-05, "loss": 13.9219, "mean_token_accuracy": 0.5985158681869507, "num_tokens": 11606691.0, "step": 174 }, { "epoch": 0.708502024291498, "grad_norm": 9.868245626915737, "learning_rate": 1.2955465587044535e-05, "loss": 13.6172, "mean_token_accuracy": 0.5943373441696167, "num_tokens": 11672625.0, "step": 175 }, { "epoch": 0.708502024291498, "eval_loss": 1.7431249618530273, "eval_mean_token_accuracy": 0.5946680456399918, "eval_num_tokens": 11672625.0, "eval_runtime": 0.6432, "eval_samples_per_second": 310.949, "eval_steps_per_second": 6.219, "step": 175 }, { "epoch": 0.7125506072874493, "grad_norm": 8.884030860368474, "learning_rate": 1.2914979757085023e-05, "loss": 14.0, "mean_token_accuracy": 0.5949345827102661, "num_tokens": 11732625.0, "step": 176 }, { "epoch": 0.7165991902834008, "grad_norm": 9.007310004782914, "learning_rate": 1.2874493927125507e-05, "loss": 14.0156, "mean_token_accuracy": 0.6007354259490967, "num_tokens": 11801766.0, "step": 177 }, { "epoch": 0.7206477732793523, "grad_norm": 8.33208496329059, "learning_rate": 1.2834008097165993e-05, "loss": 14.8281, "mean_token_accuracy": 0.5790393352508545, "num_tokens": 11863641.0, "step": 178 }, { "epoch": 0.7246963562753036, "grad_norm": 7.426550244631567, "learning_rate": 1.279352226720648e-05, "loss": 14.5625, "mean_token_accuracy": 0.5841023325920105, "num_tokens": 11933275.0, "step": 179 }, { "epoch": 0.728744939271255, "grad_norm": 6.8034373170968525, "learning_rate": 1.2753036437246964e-05, "loss": 14.6562, "mean_token_accuracy": 0.5787296295166016, "num_tokens": 12017691.0, "step": 180 }, { "epoch": 0.728744939271255, "eval_loss": 1.7400000095367432, "eval_mean_token_accuracy": 0.5955071151256561, "eval_num_tokens": 12017691.0, "eval_runtime": 0.6493, "eval_samples_per_second": 308.043, "eval_steps_per_second": 6.161, "step": 180 }, { "epoch": 0.7327935222672065, "grad_norm": 7.057663809648343, "learning_rate": 1.271255060728745e-05, "loss": 13.6875, "mean_token_accuracy": 0.5996446013450623, "num_tokens": 12079093.0, "step": 181 }, { "epoch": 0.7368421052631579, "grad_norm": 7.371289124958891, "learning_rate": 1.2672064777327936e-05, "loss": 14.5234, "mean_token_accuracy": 0.5833817720413208, "num_tokens": 12160008.0, "step": 182 }, { "epoch": 0.7408906882591093, "grad_norm": 8.707687219264766, "learning_rate": 1.263157894736842e-05, "loss": 14.2266, "mean_token_accuracy": 0.5921141505241394, "num_tokens": 12229741.0, "step": 183 }, { "epoch": 0.7449392712550608, "grad_norm": 7.516841864927085, "learning_rate": 1.2591093117408908e-05, "loss": 14.7891, "mean_token_accuracy": 0.5818018913269043, "num_tokens": 12301362.0, "step": 184 }, { "epoch": 0.7489878542510121, "grad_norm": 7.6633569797452346, "learning_rate": 1.2550607287449395e-05, "loss": 13.6406, "mean_token_accuracy": 0.6013834476470947, "num_tokens": 12370239.0, "step": 185 }, { "epoch": 0.7489878542510121, "eval_loss": 1.7356250286102295, "eval_mean_token_accuracy": 0.5960101634263992, "eval_num_tokens": 12370239.0, "eval_runtime": 0.6504, "eval_samples_per_second": 307.492, "eval_steps_per_second": 6.15, "step": 185 }, { "epoch": 0.7530364372469636, "grad_norm": 6.968835438956393, "learning_rate": 1.2510121457489879e-05, "loss": 13.5625, "mean_token_accuracy": 0.6022695302963257, "num_tokens": 12441507.0, "step": 186 }, { "epoch": 0.757085020242915, "grad_norm": 7.195791337914372, "learning_rate": 1.2469635627530365e-05, "loss": 13.4609, "mean_token_accuracy": 0.6029326915740967, "num_tokens": 12502266.0, "step": 187 }, { "epoch": 0.7611336032388664, "grad_norm": 7.668677247147722, "learning_rate": 1.2429149797570851e-05, "loss": 13.8047, "mean_token_accuracy": 0.5992096662521362, "num_tokens": 12566099.0, "step": 188 }, { "epoch": 0.7651821862348178, "grad_norm": 7.04566661191058, "learning_rate": 1.2388663967611336e-05, "loss": 13.6875, "mean_token_accuracy": 0.6022654175758362, "num_tokens": 12634407.0, "step": 189 }, { "epoch": 0.7692307692307693, "grad_norm": 7.697888634722549, "learning_rate": 1.2348178137651824e-05, "loss": 13.5703, "mean_token_accuracy": 0.6025985479354858, "num_tokens": 12695120.0, "step": 190 }, { "epoch": 0.7692307692307693, "eval_loss": 1.734375, "eval_mean_token_accuracy": 0.5960522890090942, "eval_num_tokens": 12695120.0, "eval_runtime": 0.6494, "eval_samples_per_second": 307.96, "eval_steps_per_second": 6.159, "step": 190 }, { "epoch": 0.7732793522267206, "grad_norm": 6.785752803586286, "learning_rate": 1.230769230769231e-05, "loss": 13.2031, "mean_token_accuracy": 0.6083678603172302, "num_tokens": 12765477.0, "step": 191 }, { "epoch": 0.7773279352226721, "grad_norm": 8.429385939587196, "learning_rate": 1.2267206477732794e-05, "loss": 13.6719, "mean_token_accuracy": 0.5998564958572388, "num_tokens": 12835216.0, "step": 192 }, { "epoch": 0.7813765182186235, "grad_norm": 7.85611095251478, "learning_rate": 1.222672064777328e-05, "loss": 13.1328, "mean_token_accuracy": 0.6025688648223877, "num_tokens": 12902937.0, "step": 193 }, { "epoch": 0.7854251012145749, "grad_norm": 7.6755487929843556, "learning_rate": 1.2186234817813766e-05, "loss": 13.0703, "mean_token_accuracy": 0.6151934266090393, "num_tokens": 12958920.0, "step": 194 }, { "epoch": 0.7894736842105263, "grad_norm": 9.563095955991269, "learning_rate": 1.2145748987854251e-05, "loss": 13.75, "mean_token_accuracy": 0.5952048301696777, "num_tokens": 13019128.0, "step": 195 }, { "epoch": 0.7894736842105263, "eval_loss": 1.730625033378601, "eval_mean_token_accuracy": 0.5964938104152679, "eval_num_tokens": 13019128.0, "eval_runtime": 0.6302, "eval_samples_per_second": 317.371, "eval_steps_per_second": 6.347, "step": 195 }, { "epoch": 0.7935222672064778, "grad_norm": 7.513918136227913, "learning_rate": 1.2105263157894737e-05, "loss": 14.5703, "mean_token_accuracy": 0.5844899415969849, "num_tokens": 13085188.0, "step": 196 }, { "epoch": 0.7975708502024291, "grad_norm": 7.287761719869118, "learning_rate": 1.2064777327935225e-05, "loss": 13.6484, "mean_token_accuracy": 0.5984832048416138, "num_tokens": 13155269.0, "step": 197 }, { "epoch": 0.8016194331983806, "grad_norm": 7.383829587566862, "learning_rate": 1.202429149797571e-05, "loss": 14.5156, "mean_token_accuracy": 0.5839080214500427, "num_tokens": 13229283.0, "step": 198 }, { "epoch": 0.805668016194332, "grad_norm": 12.228642188104056, "learning_rate": 1.1983805668016195e-05, "loss": 14.0234, "mean_token_accuracy": 0.5865586400032043, "num_tokens": 13298358.0, "step": 199 }, { "epoch": 0.8097165991902834, "grad_norm": 8.01540634329265, "learning_rate": 1.1943319838056682e-05, "loss": 13.4844, "mean_token_accuracy": 0.603053867816925, "num_tokens": 13366925.0, "step": 200 }, { "epoch": 0.8097165991902834, "eval_loss": 1.7278125286102295, "eval_mean_token_accuracy": 0.5974646657705307, "eval_num_tokens": 13366925.0, "eval_runtime": 0.6458, "eval_samples_per_second": 309.671, "eval_steps_per_second": 6.193, "step": 200 }, { "epoch": 0.8137651821862348, "grad_norm": 8.670916760818496, "learning_rate": 1.1902834008097166e-05, "loss": 13.5781, "mean_token_accuracy": 0.5980917811393738, "num_tokens": 13423691.0, "step": 201 }, { "epoch": 0.8178137651821862, "grad_norm": 8.271091772585052, "learning_rate": 1.1862348178137652e-05, "loss": 13.5938, "mean_token_accuracy": 0.6054921746253967, "num_tokens": 13484423.0, "step": 202 }, { "epoch": 0.8218623481781376, "grad_norm": 7.300340294135019, "learning_rate": 1.182186234817814e-05, "loss": 13.1016, "mean_token_accuracy": 0.60859215259552, "num_tokens": 13559695.0, "step": 203 }, { "epoch": 0.8259109311740891, "grad_norm": 9.1407332883229, "learning_rate": 1.1781376518218623e-05, "loss": 14.1406, "mean_token_accuracy": 0.5899603366851807, "num_tokens": 13645280.0, "step": 204 }, { "epoch": 0.8299595141700404, "grad_norm": 9.22684063564975, "learning_rate": 1.174089068825911e-05, "loss": 13.6562, "mean_token_accuracy": 0.5998020768165588, "num_tokens": 13712046.0, "step": 205 }, { "epoch": 0.8299595141700404, "eval_loss": 1.7253124713897705, "eval_mean_token_accuracy": 0.5980972796678543, "eval_num_tokens": 13712046.0, "eval_runtime": 0.6282, "eval_samples_per_second": 318.347, "eval_steps_per_second": 6.367, "step": 205 }, { "epoch": 0.8340080971659919, "grad_norm": 7.12821183237729, "learning_rate": 1.1700404858299597e-05, "loss": 14.1094, "mean_token_accuracy": 0.591342568397522, "num_tokens": 13776702.0, "step": 206 }, { "epoch": 0.8380566801619433, "grad_norm": 8.729950014794346, "learning_rate": 1.1659919028340081e-05, "loss": 14.2266, "mean_token_accuracy": 0.5930887460708618, "num_tokens": 13836408.0, "step": 207 }, { "epoch": 0.8421052631578947, "grad_norm": 7.301044795096497, "learning_rate": 1.1619433198380567e-05, "loss": 14.4922, "mean_token_accuracy": 0.5839249491691589, "num_tokens": 13900559.0, "step": 208 }, { "epoch": 0.8461538461538461, "grad_norm": 7.369160590617854, "learning_rate": 1.1578947368421053e-05, "loss": 13.5703, "mean_token_accuracy": 0.6064897775650024, "num_tokens": 13970733.0, "step": 209 }, { "epoch": 0.8502024291497976, "grad_norm": 7.851482544759256, "learning_rate": 1.1538461538461538e-05, "loss": 13.1406, "mean_token_accuracy": 0.6058458089828491, "num_tokens": 14039496.0, "step": 210 }, { "epoch": 0.8502024291497976, "eval_loss": 1.7237499952316284, "eval_mean_token_accuracy": 0.5978213995695114, "eval_num_tokens": 14039496.0, "eval_runtime": 0.6629, "eval_samples_per_second": 301.705, "eval_steps_per_second": 6.034, "step": 210 }, { "epoch": 0.854251012145749, "grad_norm": 7.716996728334529, "learning_rate": 1.1497975708502026e-05, "loss": 13.8281, "mean_token_accuracy": 0.5954342484474182, "num_tokens": 14102200.0, "step": 211 }, { "epoch": 0.8582995951417004, "grad_norm": 8.015987460003101, "learning_rate": 1.1457489878542512e-05, "loss": 14.3672, "mean_token_accuracy": 0.5886507034301758, "num_tokens": 14170374.0, "step": 212 }, { "epoch": 0.8623481781376519, "grad_norm": 7.8139654343716565, "learning_rate": 1.1417004048582996e-05, "loss": 14.0312, "mean_token_accuracy": 0.5954753756523132, "num_tokens": 14231040.0, "step": 213 }, { "epoch": 0.8663967611336032, "grad_norm": 7.595765550190981, "learning_rate": 1.1376518218623482e-05, "loss": 13.9375, "mean_token_accuracy": 0.5962504744529724, "num_tokens": 14296979.0, "step": 214 }, { "epoch": 0.8704453441295547, "grad_norm": 8.046423435434846, "learning_rate": 1.1336032388663969e-05, "loss": 14.6328, "mean_token_accuracy": 0.5841693878173828, "num_tokens": 14372997.0, "step": 215 }, { "epoch": 0.8704453441295547, "eval_loss": 1.720312476158142, "eval_mean_token_accuracy": 0.5980706065893173, "eval_num_tokens": 14372997.0, "eval_runtime": 0.6391, "eval_samples_per_second": 312.957, "eval_steps_per_second": 6.259, "step": 215 }, { "epoch": 0.8744939271255061, "grad_norm": 8.7705422913405, "learning_rate": 1.1295546558704453e-05, "loss": 14.0859, "mean_token_accuracy": 0.5949749946594238, "num_tokens": 14438573.0, "step": 216 }, { "epoch": 0.8785425101214575, "grad_norm": 7.832794988425434, "learning_rate": 1.1255060728744939e-05, "loss": 13.1484, "mean_token_accuracy": 0.613569438457489, "num_tokens": 14501484.0, "step": 217 }, { "epoch": 0.8825910931174089, "grad_norm": 7.669341614248951, "learning_rate": 1.1214574898785427e-05, "loss": 14.7031, "mean_token_accuracy": 0.5858564972877502, "num_tokens": 14574868.0, "step": 218 }, { "epoch": 0.8866396761133604, "grad_norm": 7.973334988452109, "learning_rate": 1.1174089068825913e-05, "loss": 13.8047, "mean_token_accuracy": 0.5929080247879028, "num_tokens": 14635874.0, "step": 219 }, { "epoch": 0.8906882591093117, "grad_norm": 9.162769594031285, "learning_rate": 1.1133603238866398e-05, "loss": 14.6016, "mean_token_accuracy": 0.5782532095909119, "num_tokens": 14698568.0, "step": 220 }, { "epoch": 0.8906882591093117, "eval_loss": 1.7178125381469727, "eval_mean_token_accuracy": 0.5981580018997192, "eval_num_tokens": 14698568.0, "eval_runtime": 0.6351, "eval_samples_per_second": 314.936, "eval_steps_per_second": 6.299, "step": 220 }, { "epoch": 0.8947368421052632, "grad_norm": 7.206939771632334, "learning_rate": 1.1093117408906884e-05, "loss": 13.6406, "mean_token_accuracy": 0.5951396226882935, "num_tokens": 14762701.0, "step": 221 }, { "epoch": 0.8987854251012146, "grad_norm": 8.38441745268265, "learning_rate": 1.105263157894737e-05, "loss": 14.3203, "mean_token_accuracy": 0.5884565114974976, "num_tokens": 14836157.0, "step": 222 }, { "epoch": 0.902834008097166, "grad_norm": 7.099567588103198, "learning_rate": 1.1012145748987854e-05, "loss": 13.6875, "mean_token_accuracy": 0.6000562310218811, "num_tokens": 14903783.0, "step": 223 }, { "epoch": 0.9068825910931174, "grad_norm": 7.639650741767317, "learning_rate": 1.0971659919028342e-05, "loss": 13.1719, "mean_token_accuracy": 0.6127068996429443, "num_tokens": 14964145.0, "step": 224 }, { "epoch": 0.9109311740890689, "grad_norm": 7.438961030040257, "learning_rate": 1.0931174089068828e-05, "loss": 14.3281, "mean_token_accuracy": 0.5837487578392029, "num_tokens": 15042849.0, "step": 225 }, { "epoch": 0.9109311740890689, "eval_loss": 1.7165625095367432, "eval_mean_token_accuracy": 0.5983149856328964, "eval_num_tokens": 15042849.0, "eval_runtime": 0.6312, "eval_samples_per_second": 316.878, "eval_steps_per_second": 6.338, "step": 225 }, { "epoch": 0.9149797570850202, "grad_norm": 8.491938043424565, "learning_rate": 1.0890688259109313e-05, "loss": 13.75, "mean_token_accuracy": 0.5973576903343201, "num_tokens": 15107705.0, "step": 226 }, { "epoch": 0.9190283400809717, "grad_norm": 7.63291712789061, "learning_rate": 1.0850202429149799e-05, "loss": 13.9375, "mean_token_accuracy": 0.5994954109191895, "num_tokens": 15179511.0, "step": 227 }, { "epoch": 0.9230769230769231, "grad_norm": 7.056222313978797, "learning_rate": 1.0809716599190285e-05, "loss": 13.75, "mean_token_accuracy": 0.6037131547927856, "num_tokens": 15240170.0, "step": 228 }, { "epoch": 0.9271255060728745, "grad_norm": 7.894406360447493, "learning_rate": 1.076923076923077e-05, "loss": 13.4609, "mean_token_accuracy": 0.5994337201118469, "num_tokens": 15300626.0, "step": 229 }, { "epoch": 0.9311740890688259, "grad_norm": 8.960700403802914, "learning_rate": 1.0728744939271255e-05, "loss": 13.7578, "mean_token_accuracy": 0.5973832607269287, "num_tokens": 15366420.0, "step": 230 }, { "epoch": 0.9311740890688259, "eval_loss": 1.713437557220459, "eval_mean_token_accuracy": 0.5988392233848572, "eval_num_tokens": 15366420.0, "eval_runtime": 0.6821, "eval_samples_per_second": 293.227, "eval_steps_per_second": 5.865, "step": 230 }, { "epoch": 0.9352226720647774, "grad_norm": 8.458253193834494, "learning_rate": 1.0688259109311743e-05, "loss": 12.6172, "mean_token_accuracy": 0.6170424222946167, "num_tokens": 15421793.0, "step": 231 }, { "epoch": 0.9392712550607287, "grad_norm": 7.0686015225039665, "learning_rate": 1.0647773279352228e-05, "loss": 13.5625, "mean_token_accuracy": 0.607692301273346, "num_tokens": 15485817.0, "step": 232 }, { "epoch": 0.9433198380566802, "grad_norm": 7.576993960188947, "learning_rate": 1.0607287449392714e-05, "loss": 13.8984, "mean_token_accuracy": 0.5907003879547119, "num_tokens": 15564078.0, "step": 233 }, { "epoch": 0.9473684210526315, "grad_norm": 6.97349951130989, "learning_rate": 1.05668016194332e-05, "loss": 13.2969, "mean_token_accuracy": 0.601026713848114, "num_tokens": 15633099.0, "step": 234 }, { "epoch": 0.951417004048583, "grad_norm": 7.03291868031142, "learning_rate": 1.0526315789473684e-05, "loss": 13.9062, "mean_token_accuracy": 0.5920431613922119, "num_tokens": 15710279.0, "step": 235 }, { "epoch": 0.951417004048583, "eval_loss": 1.7118749618530273, "eval_mean_token_accuracy": 0.5990229398012161, "eval_num_tokens": 15710279.0, "eval_runtime": 0.6566, "eval_samples_per_second": 304.588, "eval_steps_per_second": 6.092, "step": 235 }, { "epoch": 0.9554655870445344, "grad_norm": 6.669888447299149, "learning_rate": 1.048582995951417e-05, "loss": 14.1641, "mean_token_accuracy": 0.5870757102966309, "num_tokens": 15788970.0, "step": 236 }, { "epoch": 0.9595141700404858, "grad_norm": 7.4727551311500635, "learning_rate": 1.0445344129554658e-05, "loss": 13.3203, "mean_token_accuracy": 0.6026445031166077, "num_tokens": 15854074.0, "step": 237 }, { "epoch": 0.9635627530364372, "grad_norm": 7.907361177198574, "learning_rate": 1.0404858299595141e-05, "loss": 12.4453, "mean_token_accuracy": 0.6250936388969421, "num_tokens": 15915540.0, "step": 238 }, { "epoch": 0.9676113360323887, "grad_norm": 8.43850635314641, "learning_rate": 1.0364372469635629e-05, "loss": 13.1562, "mean_token_accuracy": 0.6030080318450928, "num_tokens": 15976973.0, "step": 239 }, { "epoch": 0.97165991902834, "grad_norm": 7.4122240402160875, "learning_rate": 1.0323886639676115e-05, "loss": 13.0938, "mean_token_accuracy": 0.6093297004699707, "num_tokens": 16032923.0, "step": 240 }, { "epoch": 0.97165991902834, "eval_loss": 1.709375023841858, "eval_mean_token_accuracy": 0.5996900647878647, "eval_num_tokens": 16032923.0, "eval_runtime": 0.671, "eval_samples_per_second": 298.041, "eval_steps_per_second": 5.961, "step": 240 }, { "epoch": 0.9757085020242915, "grad_norm": 7.354557905255342, "learning_rate": 1.02834008097166e-05, "loss": 14.0156, "mean_token_accuracy": 0.6022666692733765, "num_tokens": 16094928.0, "step": 241 }, { "epoch": 0.979757085020243, "grad_norm": 6.816456710472039, "learning_rate": 1.0242914979757086e-05, "loss": 13.2266, "mean_token_accuracy": 0.6128579378128052, "num_tokens": 16164039.0, "step": 242 }, { "epoch": 0.9838056680161943, "grad_norm": 7.422654481239377, "learning_rate": 1.0202429149797572e-05, "loss": 14.2344, "mean_token_accuracy": 0.5937775373458862, "num_tokens": 16241339.0, "step": 243 }, { "epoch": 0.9878542510121457, "grad_norm": 6.404505160904721, "learning_rate": 1.0161943319838056e-05, "loss": 12.7969, "mean_token_accuracy": 0.6247119903564453, "num_tokens": 16315626.0, "step": 244 }, { "epoch": 0.9919028340080972, "grad_norm": 6.767676657987019, "learning_rate": 1.0121457489878544e-05, "loss": 13.0391, "mean_token_accuracy": 0.6165784597396851, "num_tokens": 16380702.0, "step": 245 }, { "epoch": 0.9919028340080972, "eval_loss": 1.7059375047683716, "eval_mean_token_accuracy": 0.5998604446649551, "eval_num_tokens": 16380702.0, "eval_runtime": 0.6318, "eval_samples_per_second": 316.572, "eval_steps_per_second": 6.331, "step": 245 }, { "epoch": 0.9959514170040485, "grad_norm": 8.40463068782111, "learning_rate": 1.008097165991903e-05, "loss": 13.5469, "mean_token_accuracy": 0.6080746054649353, "num_tokens": 16451754.0, "step": 246 }, { "epoch": 1.0, "grad_norm": 8.465098958585465, "learning_rate": 1.0040485829959515e-05, "loss": 13.3047, "mean_token_accuracy": 0.609171450138092, "num_tokens": 16516606.0, "step": 247 }, { "epoch": 1.0040485829959513, "grad_norm": 6.50385357417948, "learning_rate": 1e-05, "loss": 13.4297, "mean_token_accuracy": 0.6065150499343872, "num_tokens": 16584820.0, "step": 248 }, { "epoch": 1.008097165991903, "grad_norm": 7.977746144483265, "learning_rate": 9.959514170040487e-06, "loss": 13.5391, "mean_token_accuracy": 0.6004602313041687, "num_tokens": 16652244.0, "step": 249 }, { "epoch": 1.0121457489878543, "grad_norm": 7.116151586707309, "learning_rate": 9.919028340080973e-06, "loss": 13.3516, "mean_token_accuracy": 0.599148154258728, "num_tokens": 16715698.0, "step": 250 }, { "epoch": 1.0121457489878543, "eval_loss": 1.704687476158142, "eval_mean_token_accuracy": 0.5997532159090042, "eval_num_tokens": 16715698.0, "eval_runtime": 0.6344, "eval_samples_per_second": 315.241, "eval_steps_per_second": 6.305, "step": 250 }, { "epoch": 1.0161943319838056, "grad_norm": 7.204779568487317, "learning_rate": 9.878542510121458e-06, "loss": 13.4922, "mean_token_accuracy": 0.6105347275733948, "num_tokens": 16782076.0, "step": 251 }, { "epoch": 1.0202429149797572, "grad_norm": 6.934398308018714, "learning_rate": 9.838056680161944e-06, "loss": 12.2969, "mean_token_accuracy": 0.6307468414306641, "num_tokens": 16844000.0, "step": 252 }, { "epoch": 1.0242914979757085, "grad_norm": 6.262390941001453, "learning_rate": 9.79757085020243e-06, "loss": 12.1875, "mean_token_accuracy": 0.6304177045822144, "num_tokens": 16913997.0, "step": 253 }, { "epoch": 1.0283400809716599, "grad_norm": 7.26353805403556, "learning_rate": 9.757085020242916e-06, "loss": 13.0234, "mean_token_accuracy": 0.6181889176368713, "num_tokens": 16975505.0, "step": 254 }, { "epoch": 1.0323886639676114, "grad_norm": 7.253371839731625, "learning_rate": 9.7165991902834e-06, "loss": 12.9297, "mean_token_accuracy": 0.6192678809165955, "num_tokens": 17045831.0, "step": 255 }, { "epoch": 1.0323886639676114, "eval_loss": 1.7037500143051147, "eval_mean_token_accuracy": 0.600011944770813, "eval_num_tokens": 17045831.0, "eval_runtime": 0.6321, "eval_samples_per_second": 316.386, "eval_steps_per_second": 6.328, "step": 255 }, { "epoch": 1.0364372469635628, "grad_norm": 7.103579854054674, "learning_rate": 9.676113360323888e-06, "loss": 13.3125, "mean_token_accuracy": 0.6100465655326843, "num_tokens": 17114834.0, "step": 256 }, { "epoch": 1.040485829959514, "grad_norm": 7.116930652514039, "learning_rate": 9.635627530364373e-06, "loss": 13.9219, "mean_token_accuracy": 0.5969142317771912, "num_tokens": 17185739.0, "step": 257 }, { "epoch": 1.0445344129554657, "grad_norm": 7.377322519725867, "learning_rate": 9.595141700404859e-06, "loss": 12.9922, "mean_token_accuracy": 0.6139745712280273, "num_tokens": 17239157.0, "step": 258 }, { "epoch": 1.048582995951417, "grad_norm": 6.975650481941647, "learning_rate": 9.554655870445345e-06, "loss": 13.4297, "mean_token_accuracy": 0.6126266121864319, "num_tokens": 17303593.0, "step": 259 }, { "epoch": 1.0526315789473684, "grad_norm": 6.359499519445163, "learning_rate": 9.514170040485831e-06, "loss": 13.2734, "mean_token_accuracy": 0.6133294105529785, "num_tokens": 17368146.0, "step": 260 }, { "epoch": 1.0526315789473684, "eval_loss": 1.703125, "eval_mean_token_accuracy": 0.6001382917165756, "eval_num_tokens": 17368146.0, "eval_runtime": 0.6446, "eval_samples_per_second": 310.258, "eval_steps_per_second": 6.205, "step": 260 }, { "epoch": 1.05668016194332, "grad_norm": 6.671493038575028, "learning_rate": 9.473684210526315e-06, "loss": 12.0781, "mean_token_accuracy": 0.6352812647819519, "num_tokens": 17428599.0, "step": 261 }, { "epoch": 1.0607287449392713, "grad_norm": 7.378756810172295, "learning_rate": 9.433198380566803e-06, "loss": 12.2734, "mean_token_accuracy": 0.6308179497718811, "num_tokens": 17491185.0, "step": 262 }, { "epoch": 1.0647773279352226, "grad_norm": 7.720411271822566, "learning_rate": 9.392712550607288e-06, "loss": 13.9922, "mean_token_accuracy": 0.5976829528808594, "num_tokens": 17558922.0, "step": 263 }, { "epoch": 1.0688259109311742, "grad_norm": 7.148823701274059, "learning_rate": 9.352226720647774e-06, "loss": 12.6562, "mean_token_accuracy": 0.6221409440040588, "num_tokens": 17625704.0, "step": 264 }, { "epoch": 1.0728744939271255, "grad_norm": 6.952211161763132, "learning_rate": 9.31174089068826e-06, "loss": 13.0, "mean_token_accuracy": 0.6169036030769348, "num_tokens": 17693375.0, "step": 265 }, { "epoch": 1.0728744939271255, "eval_loss": 1.7009375095367432, "eval_mean_token_accuracy": 0.6005319058895111, "eval_num_tokens": 17693375.0, "eval_runtime": 0.6522, "eval_samples_per_second": 306.677, "eval_steps_per_second": 6.134, "step": 265 }, { "epoch": 1.0769230769230769, "grad_norm": 7.532405610943497, "learning_rate": 9.271255060728746e-06, "loss": 13.4609, "mean_token_accuracy": 0.6039229035377502, "num_tokens": 17756505.0, "step": 266 }, { "epoch": 1.0809716599190284, "grad_norm": 6.463674669796744, "learning_rate": 9.230769230769232e-06, "loss": 12.7422, "mean_token_accuracy": 0.6189759969711304, "num_tokens": 17827037.0, "step": 267 }, { "epoch": 1.0850202429149798, "grad_norm": 7.120027379484065, "learning_rate": 9.190283400809717e-06, "loss": 13.7656, "mean_token_accuracy": 0.5981738567352295, "num_tokens": 17892704.0, "step": 268 }, { "epoch": 1.0890688259109311, "grad_norm": 6.2456984016212065, "learning_rate": 9.149797570850203e-06, "loss": 12.7344, "mean_token_accuracy": 0.6219497919082642, "num_tokens": 17968213.0, "step": 269 }, { "epoch": 1.0931174089068827, "grad_norm": 6.297910754288368, "learning_rate": 9.109311740890689e-06, "loss": 12.7188, "mean_token_accuracy": 0.6200737357139587, "num_tokens": 18039604.0, "step": 270 }, { "epoch": 1.0931174089068827, "eval_loss": 1.7000000476837158, "eval_mean_token_accuracy": 0.6004240810871124, "eval_num_tokens": 18039604.0, "eval_runtime": 0.6437, "eval_samples_per_second": 310.685, "eval_steps_per_second": 6.214, "step": 270 }, { "epoch": 1.097165991902834, "grad_norm": 7.198610948868953, "learning_rate": 9.068825910931175e-06, "loss": 12.2812, "mean_token_accuracy": 0.6254372000694275, "num_tokens": 18096278.0, "step": 271 }, { "epoch": 1.1012145748987854, "grad_norm": 6.306684772679187, "learning_rate": 9.02834008097166e-06, "loss": 13.7031, "mean_token_accuracy": 0.6017244458198547, "num_tokens": 18175556.0, "step": 272 }, { "epoch": 1.1052631578947367, "grad_norm": 6.50769292978356, "learning_rate": 8.987854251012147e-06, "loss": 12.2578, "mean_token_accuracy": 0.6318495273590088, "num_tokens": 18243307.0, "step": 273 }, { "epoch": 1.1093117408906883, "grad_norm": 6.913532871754586, "learning_rate": 8.947368421052632e-06, "loss": 12.9922, "mean_token_accuracy": 0.61337810754776, "num_tokens": 18309195.0, "step": 274 }, { "epoch": 1.1133603238866396, "grad_norm": 6.884920896555758, "learning_rate": 8.906882591093118e-06, "loss": 13.875, "mean_token_accuracy": 0.6053746342658997, "num_tokens": 18376761.0, "step": 275 }, { "epoch": 1.1133603238866396, "eval_loss": 1.6981250047683716, "eval_mean_token_accuracy": 0.6006451994180679, "eval_num_tokens": 18376761.0, "eval_runtime": 0.6441, "eval_samples_per_second": 310.493, "eval_steps_per_second": 6.21, "step": 275 }, { "epoch": 1.117408906882591, "grad_norm": 7.596057850426039, "learning_rate": 8.866396761133604e-06, "loss": 13.5703, "mean_token_accuracy": 0.5959470868110657, "num_tokens": 18438213.0, "step": 276 }, { "epoch": 1.1214574898785425, "grad_norm": 7.244460879105897, "learning_rate": 8.82591093117409e-06, "loss": 13.6094, "mean_token_accuracy": 0.6075268983840942, "num_tokens": 18508957.0, "step": 277 }, { "epoch": 1.125506072874494, "grad_norm": 7.695579384649985, "learning_rate": 8.785425101214575e-06, "loss": 13.1641, "mean_token_accuracy": 0.6176554560661316, "num_tokens": 18565083.0, "step": 278 }, { "epoch": 1.1295546558704452, "grad_norm": 6.522870352716677, "learning_rate": 8.744939271255063e-06, "loss": 12.6953, "mean_token_accuracy": 0.6225432753562927, "num_tokens": 18633531.0, "step": 279 }, { "epoch": 1.1336032388663968, "grad_norm": 7.604533792660402, "learning_rate": 8.704453441295547e-06, "loss": 12.4688, "mean_token_accuracy": 0.6240532398223877, "num_tokens": 18689313.0, "step": 280 }, { "epoch": 1.1336032388663968, "eval_loss": 1.6974999904632568, "eval_mean_token_accuracy": 0.6007698774337769, "eval_num_tokens": 18689313.0, "eval_runtime": 0.6368, "eval_samples_per_second": 314.051, "eval_steps_per_second": 6.281, "step": 280 }, { "epoch": 1.1376518218623481, "grad_norm": 6.192506768790122, "learning_rate": 8.663967611336033e-06, "loss": 13.0312, "mean_token_accuracy": 0.6120434999465942, "num_tokens": 18756467.0, "step": 281 }, { "epoch": 1.1417004048582995, "grad_norm": 5.735298339214011, "learning_rate": 8.62348178137652e-06, "loss": 12.3672, "mean_token_accuracy": 0.6290688514709473, "num_tokens": 18830631.0, "step": 282 }, { "epoch": 1.145748987854251, "grad_norm": 6.8807292772906985, "learning_rate": 8.582995951417005e-06, "loss": 12.9688, "mean_token_accuracy": 0.6203005313873291, "num_tokens": 18902033.0, "step": 283 }, { "epoch": 1.1497975708502024, "grad_norm": 6.299127839988924, "learning_rate": 8.54251012145749e-06, "loss": 11.7344, "mean_token_accuracy": 0.6383894085884094, "num_tokens": 18965552.0, "step": 284 }, { "epoch": 1.1538461538461537, "grad_norm": 7.602932263969704, "learning_rate": 8.502024291497976e-06, "loss": 13.7656, "mean_token_accuracy": 0.6015710830688477, "num_tokens": 19032067.0, "step": 285 }, { "epoch": 1.1538461538461537, "eval_loss": 1.6959375143051147, "eval_mean_token_accuracy": 0.6004112809896469, "eval_num_tokens": 19032067.0, "eval_runtime": 0.6551, "eval_samples_per_second": 305.283, "eval_steps_per_second": 6.106, "step": 285 }, { "epoch": 1.1578947368421053, "grad_norm": 6.900267004886904, "learning_rate": 8.461538461538462e-06, "loss": 12.1562, "mean_token_accuracy": 0.6340711116790771, "num_tokens": 19091771.0, "step": 286 }, { "epoch": 1.1619433198380567, "grad_norm": 7.015232796047964, "learning_rate": 8.421052631578948e-06, "loss": 12.7578, "mean_token_accuracy": 0.6194669604301453, "num_tokens": 19160875.0, "step": 287 }, { "epoch": 1.165991902834008, "grad_norm": 8.07776854225312, "learning_rate": 8.380566801619434e-06, "loss": 12.8672, "mean_token_accuracy": 0.621356189250946, "num_tokens": 19234248.0, "step": 288 }, { "epoch": 1.1700404858299596, "grad_norm": 6.886729565425438, "learning_rate": 8.340080971659919e-06, "loss": 13.3125, "mean_token_accuracy": 0.6096766591072083, "num_tokens": 19300037.0, "step": 289 }, { "epoch": 1.174089068825911, "grad_norm": 7.451714517536367, "learning_rate": 8.299595141700405e-06, "loss": 12.9062, "mean_token_accuracy": 0.6140288710594177, "num_tokens": 19359507.0, "step": 290 }, { "epoch": 1.174089068825911, "eval_loss": 1.6950000524520874, "eval_mean_token_accuracy": 0.600911945104599, "eval_num_tokens": 19359507.0, "eval_runtime": 0.6622, "eval_samples_per_second": 302.012, "eval_steps_per_second": 6.04, "step": 290 }, { "epoch": 1.1781376518218623, "grad_norm": 6.19859060055071, "learning_rate": 8.259109311740891e-06, "loss": 12.9922, "mean_token_accuracy": 0.6148094534873962, "num_tokens": 19426042.0, "step": 291 }, { "epoch": 1.1821862348178138, "grad_norm": 6.3346510938910345, "learning_rate": 8.218623481781377e-06, "loss": 12.8516, "mean_token_accuracy": 0.615065336227417, "num_tokens": 19494767.0, "step": 292 }, { "epoch": 1.1862348178137652, "grad_norm": 7.185444383014579, "learning_rate": 8.178137651821862e-06, "loss": 12.8281, "mean_token_accuracy": 0.6221780776977539, "num_tokens": 19561762.0, "step": 293 }, { "epoch": 1.1902834008097165, "grad_norm": 7.763951709420421, "learning_rate": 8.13765182186235e-06, "loss": 13.5312, "mean_token_accuracy": 0.6010787487030029, "num_tokens": 19629313.0, "step": 294 }, { "epoch": 1.194331983805668, "grad_norm": 6.6829771026826785, "learning_rate": 8.097165991902834e-06, "loss": 12.6875, "mean_token_accuracy": 0.6254391074180603, "num_tokens": 19699694.0, "step": 295 }, { "epoch": 1.194331983805668, "eval_loss": 1.6931250095367432, "eval_mean_token_accuracy": 0.6008877456188202, "eval_num_tokens": 19699694.0, "eval_runtime": 0.6362, "eval_samples_per_second": 314.38, "eval_steps_per_second": 6.288, "step": 295 }, { "epoch": 1.1983805668016194, "grad_norm": 6.897120986568694, "learning_rate": 8.056680161943322e-06, "loss": 12.2109, "mean_token_accuracy": 0.6307302713394165, "num_tokens": 19757434.0, "step": 296 }, { "epoch": 1.2024291497975708, "grad_norm": 7.44384813842545, "learning_rate": 8.016194331983806e-06, "loss": 12.7656, "mean_token_accuracy": 0.6187056303024292, "num_tokens": 19820570.0, "step": 297 }, { "epoch": 1.2064777327935223, "grad_norm": 6.4203676546627015, "learning_rate": 7.975708502024292e-06, "loss": 13.2812, "mean_token_accuracy": 0.6042314767837524, "num_tokens": 19890728.0, "step": 298 }, { "epoch": 1.2105263157894737, "grad_norm": 6.848564846162855, "learning_rate": 7.935222672064778e-06, "loss": 13.0703, "mean_token_accuracy": 0.6080402731895447, "num_tokens": 19959322.0, "step": 299 }, { "epoch": 1.214574898785425, "grad_norm": 6.862334865838493, "learning_rate": 7.894736842105265e-06, "loss": 13.3516, "mean_token_accuracy": 0.6005032062530518, "num_tokens": 20031722.0, "step": 300 }, { "epoch": 1.214574898785425, "eval_loss": 1.691562533378601, "eval_mean_token_accuracy": 0.6006248891353607, "eval_num_tokens": 20031722.0, "eval_runtime": 0.6358, "eval_samples_per_second": 314.55, "eval_steps_per_second": 6.291, "step": 300 }, { "epoch": 1.2186234817813766, "grad_norm": 7.001796551105443, "learning_rate": 7.854251012145749e-06, "loss": 12.7109, "mean_token_accuracy": 0.6257545948028564, "num_tokens": 20104837.0, "step": 301 }, { "epoch": 1.222672064777328, "grad_norm": 7.289919019857029, "learning_rate": 7.813765182186235e-06, "loss": 12.8828, "mean_token_accuracy": 0.6084967851638794, "num_tokens": 20172668.0, "step": 302 }, { "epoch": 1.2267206477732793, "grad_norm": 6.345630914099327, "learning_rate": 7.773279352226721e-06, "loss": 12.9141, "mean_token_accuracy": 0.6153846383094788, "num_tokens": 20241372.0, "step": 303 }, { "epoch": 1.2307692307692308, "grad_norm": 7.5520644152330245, "learning_rate": 7.732793522267207e-06, "loss": 13.6797, "mean_token_accuracy": 0.607166588306427, "num_tokens": 20310730.0, "step": 304 }, { "epoch": 1.2348178137651822, "grad_norm": 8.642720338892964, "learning_rate": 7.692307692307694e-06, "loss": 12.9219, "mean_token_accuracy": 0.6174276471138, "num_tokens": 20378755.0, "step": 305 }, { "epoch": 1.2348178137651822, "eval_loss": 1.6906249523162842, "eval_mean_token_accuracy": 0.601470410823822, "eval_num_tokens": 20378755.0, "eval_runtime": 0.6338, "eval_samples_per_second": 315.573, "eval_steps_per_second": 6.311, "step": 305 }, { "epoch": 1.2388663967611335, "grad_norm": 6.901290072516897, "learning_rate": 7.651821862348178e-06, "loss": 13.1094, "mean_token_accuracy": 0.6087667942047119, "num_tokens": 20449130.0, "step": 306 }, { "epoch": 1.242914979757085, "grad_norm": 5.8437165539153995, "learning_rate": 7.611336032388664e-06, "loss": 12.1797, "mean_token_accuracy": 0.6274487972259521, "num_tokens": 20511878.0, "step": 307 }, { "epoch": 1.2469635627530364, "grad_norm": 6.275119502029468, "learning_rate": 7.570850202429151e-06, "loss": 13.0, "mean_token_accuracy": 0.6140267252922058, "num_tokens": 20580939.0, "step": 308 }, { "epoch": 1.2510121457489878, "grad_norm": 6.632528297788201, "learning_rate": 7.5303643724696364e-06, "loss": 12.2969, "mean_token_accuracy": 0.6276517510414124, "num_tokens": 20646337.0, "step": 309 }, { "epoch": 1.2550607287449393, "grad_norm": 6.802264706016231, "learning_rate": 7.489878542510122e-06, "loss": 12.5312, "mean_token_accuracy": 0.6189518570899963, "num_tokens": 20706089.0, "step": 310 }, { "epoch": 1.2550607287449393, "eval_loss": 1.6903125047683716, "eval_mean_token_accuracy": 0.6015071421861649, "eval_num_tokens": 20706089.0, "eval_runtime": 0.6486, "eval_samples_per_second": 308.342, "eval_steps_per_second": 6.167, "step": 310 }, { "epoch": 1.2591093117408907, "grad_norm": 6.618283699689559, "learning_rate": 7.449392712550608e-06, "loss": 12.2422, "mean_token_accuracy": 0.6300427317619324, "num_tokens": 20776353.0, "step": 311 }, { "epoch": 1.263157894736842, "grad_norm": 6.385834652503417, "learning_rate": 7.408906882591094e-06, "loss": 13.1641, "mean_token_accuracy": 0.6120158433914185, "num_tokens": 20839051.0, "step": 312 }, { "epoch": 1.2672064777327936, "grad_norm": 6.604656501463769, "learning_rate": 7.368421052631579e-06, "loss": 12.3516, "mean_token_accuracy": 0.6265976428985596, "num_tokens": 20907888.0, "step": 313 }, { "epoch": 1.271255060728745, "grad_norm": 6.333208429557312, "learning_rate": 7.327935222672065e-06, "loss": 12.2422, "mean_token_accuracy": 0.6266438961029053, "num_tokens": 20973573.0, "step": 314 }, { "epoch": 1.2753036437246963, "grad_norm": 6.851872002967978, "learning_rate": 7.2874493927125516e-06, "loss": 13.3438, "mean_token_accuracy": 0.6068040132522583, "num_tokens": 21041626.0, "step": 315 }, { "epoch": 1.2753036437246963, "eval_loss": 1.690000057220459, "eval_mean_token_accuracy": 0.6009943783283234, "eval_num_tokens": 21041626.0, "eval_runtime": 0.6597, "eval_samples_per_second": 303.162, "eval_steps_per_second": 6.063, "step": 315 }, { "epoch": 1.2793522267206479, "grad_norm": 7.312872948522801, "learning_rate": 7.246963562753037e-06, "loss": 13.3906, "mean_token_accuracy": 0.6119928359985352, "num_tokens": 21109864.0, "step": 316 }, { "epoch": 1.2834008097165992, "grad_norm": 7.258406742426028, "learning_rate": 7.206477732793523e-06, "loss": 13.6797, "mean_token_accuracy": 0.6022233963012695, "num_tokens": 21180631.0, "step": 317 }, { "epoch": 1.2874493927125505, "grad_norm": 7.236283798071484, "learning_rate": 7.165991902834008e-06, "loss": 13.5156, "mean_token_accuracy": 0.607459545135498, "num_tokens": 21249144.0, "step": 318 }, { "epoch": 1.291497975708502, "grad_norm": 6.443354672755848, "learning_rate": 7.125506072874494e-06, "loss": 12.5156, "mean_token_accuracy": 0.6236591339111328, "num_tokens": 21315677.0, "step": 319 }, { "epoch": 1.2955465587044535, "grad_norm": 6.703854250267342, "learning_rate": 7.0850202429149805e-06, "loss": 12.0312, "mean_token_accuracy": 0.6358534693717957, "num_tokens": 21370804.0, "step": 320 }, { "epoch": 1.2955465587044535, "eval_loss": 1.6896874904632568, "eval_mean_token_accuracy": 0.6014674603939056, "eval_num_tokens": 21370804.0, "eval_runtime": 0.6354, "eval_samples_per_second": 314.742, "eval_steps_per_second": 6.295, "step": 320 }, { "epoch": 1.2995951417004048, "grad_norm": 6.223799662239967, "learning_rate": 7.044534412955466e-06, "loss": 13.5234, "mean_token_accuracy": 0.6001178622245789, "num_tokens": 21442134.0, "step": 321 }, { "epoch": 1.3036437246963564, "grad_norm": 6.245172102328749, "learning_rate": 7.004048582995951e-06, "loss": 13.3438, "mean_token_accuracy": 0.6050729751586914, "num_tokens": 21505987.0, "step": 322 }, { "epoch": 1.3076923076923077, "grad_norm": 7.640593911902389, "learning_rate": 6.963562753036438e-06, "loss": 13.125, "mean_token_accuracy": 0.614285945892334, "num_tokens": 21571444.0, "step": 323 }, { "epoch": 1.311740890688259, "grad_norm": 6.182617066137327, "learning_rate": 6.923076923076923e-06, "loss": 12.9219, "mean_token_accuracy": 0.6171441078186035, "num_tokens": 21631622.0, "step": 324 }, { "epoch": 1.3157894736842106, "grad_norm": 6.612642501099769, "learning_rate": 6.882591093117409e-06, "loss": 13.1328, "mean_token_accuracy": 0.6197838187217712, "num_tokens": 21699600.0, "step": 325 }, { "epoch": 1.3157894736842106, "eval_loss": 1.6884374618530273, "eval_mean_token_accuracy": 0.6019182503223419, "eval_num_tokens": 21699600.0, "eval_runtime": 0.6487, "eval_samples_per_second": 308.327, "eval_steps_per_second": 6.167, "step": 325 }, { "epoch": 1.319838056680162, "grad_norm": 6.268645760381665, "learning_rate": 6.842105263157896e-06, "loss": 13.0859, "mean_token_accuracy": 0.6107904314994812, "num_tokens": 21770004.0, "step": 326 }, { "epoch": 1.3238866396761133, "grad_norm": 6.480172792037266, "learning_rate": 6.801619433198381e-06, "loss": 13.4766, "mean_token_accuracy": 0.6045525670051575, "num_tokens": 21835174.0, "step": 327 }, { "epoch": 1.3279352226720649, "grad_norm": 6.89015735331616, "learning_rate": 6.761133603238867e-06, "loss": 12.0156, "mean_token_accuracy": 0.6316675543785095, "num_tokens": 21901773.0, "step": 328 }, { "epoch": 1.3319838056680162, "grad_norm": 6.136078812928923, "learning_rate": 6.720647773279353e-06, "loss": 12.8594, "mean_token_accuracy": 0.6160688400268555, "num_tokens": 21973454.0, "step": 329 }, { "epoch": 1.3360323886639676, "grad_norm": 6.530535217583928, "learning_rate": 6.6801619433198385e-06, "loss": 13.3281, "mean_token_accuracy": 0.608657717704773, "num_tokens": 22043421.0, "step": 330 }, { "epoch": 1.3360323886639676, "eval_loss": 1.6871875524520874, "eval_mean_token_accuracy": 0.6020240485668182, "eval_num_tokens": 22043421.0, "eval_runtime": 0.6344, "eval_samples_per_second": 315.283, "eval_steps_per_second": 6.306, "step": 330 }, { "epoch": 1.3400809716599191, "grad_norm": 7.032557524136393, "learning_rate": 6.639676113360325e-06, "loss": 13.6797, "mean_token_accuracy": 0.5984200835227966, "num_tokens": 22112983.0, "step": 331 }, { "epoch": 1.3441295546558705, "grad_norm": 6.878568332018199, "learning_rate": 6.599190283400811e-06, "loss": 13.625, "mean_token_accuracy": 0.60277259349823, "num_tokens": 22174362.0, "step": 332 }, { "epoch": 1.3481781376518218, "grad_norm": 6.2805599900614615, "learning_rate": 6.558704453441296e-06, "loss": 14.2031, "mean_token_accuracy": 0.592296838760376, "num_tokens": 22245929.0, "step": 333 }, { "epoch": 1.3522267206477734, "grad_norm": 6.851335318270516, "learning_rate": 6.518218623481782e-06, "loss": 13.8516, "mean_token_accuracy": 0.6001754403114319, "num_tokens": 22315536.0, "step": 334 }, { "epoch": 1.3562753036437247, "grad_norm": 6.690083307280702, "learning_rate": 6.4777327935222675e-06, "loss": 13.9688, "mean_token_accuracy": 0.5958229303359985, "num_tokens": 22381388.0, "step": 335 }, { "epoch": 1.3562753036437247, "eval_loss": 1.6865625381469727, "eval_mean_token_accuracy": 0.601932093501091, "eval_num_tokens": 22381388.0, "eval_runtime": 0.6419, "eval_samples_per_second": 311.575, "eval_steps_per_second": 6.232, "step": 335 }, { "epoch": 1.360323886639676, "grad_norm": 7.1780394505714336, "learning_rate": 6.437246963562754e-06, "loss": 12.5156, "mean_token_accuracy": 0.6246969699859619, "num_tokens": 22448683.0, "step": 336 }, { "epoch": 1.3643724696356276, "grad_norm": 7.289906239024176, "learning_rate": 6.39676113360324e-06, "loss": 13.5469, "mean_token_accuracy": 0.6012796759605408, "num_tokens": 22510793.0, "step": 337 }, { "epoch": 1.368421052631579, "grad_norm": 6.901589521281514, "learning_rate": 6.356275303643725e-06, "loss": 13.5234, "mean_token_accuracy": 0.6072494983673096, "num_tokens": 22579193.0, "step": 338 }, { "epoch": 1.3724696356275303, "grad_norm": 6.256711193594864, "learning_rate": 6.31578947368421e-06, "loss": 12.7188, "mean_token_accuracy": 0.622461199760437, "num_tokens": 22649271.0, "step": 339 }, { "epoch": 1.376518218623482, "grad_norm": 5.928363958207777, "learning_rate": 6.275303643724697e-06, "loss": 12.8203, "mean_token_accuracy": 0.6171985268592834, "num_tokens": 22721597.0, "step": 340 }, { "epoch": 1.376518218623482, "eval_loss": 1.6862499713897705, "eval_mean_token_accuracy": 0.6023992896080017, "eval_num_tokens": 22721597.0, "eval_runtime": 0.6436, "eval_samples_per_second": 310.773, "eval_steps_per_second": 6.215, "step": 340 }, { "epoch": 1.3805668016194332, "grad_norm": 7.046994398153724, "learning_rate": 6.234817813765183e-06, "loss": 13.5625, "mean_token_accuracy": 0.6081273555755615, "num_tokens": 22788300.0, "step": 341 }, { "epoch": 1.3846153846153846, "grad_norm": 5.679939194503434, "learning_rate": 6.194331983805668e-06, "loss": 12.5, "mean_token_accuracy": 0.6215187907218933, "num_tokens": 22859065.0, "step": 342 }, { "epoch": 1.3886639676113361, "grad_norm": 6.613152950818566, "learning_rate": 6.153846153846155e-06, "loss": 12.8125, "mean_token_accuracy": 0.6154517531394958, "num_tokens": 22923264.0, "step": 343 }, { "epoch": 1.3927125506072875, "grad_norm": 6.167835687979181, "learning_rate": 6.11336032388664e-06, "loss": 12.8047, "mean_token_accuracy": 0.6202464699745178, "num_tokens": 22980128.0, "step": 344 }, { "epoch": 1.3967611336032388, "grad_norm": 6.861084643476399, "learning_rate": 6.0728744939271254e-06, "loss": 12.0469, "mean_token_accuracy": 0.6357682347297668, "num_tokens": 23038976.0, "step": 345 }, { "epoch": 1.3967611336032388, "eval_loss": 1.6846874952316284, "eval_mean_token_accuracy": 0.6027306467294693, "eval_num_tokens": 23038976.0, "eval_runtime": 0.6479, "eval_samples_per_second": 308.696, "eval_steps_per_second": 6.174, "step": 345 }, { "epoch": 1.4008097165991904, "grad_norm": 6.0083674637757065, "learning_rate": 6.0323886639676124e-06, "loss": 12.8516, "mean_token_accuracy": 0.6229951977729797, "num_tokens": 23113673.0, "step": 346 }, { "epoch": 1.4048582995951417, "grad_norm": 6.91334940259032, "learning_rate": 5.991902834008098e-06, "loss": 13.6016, "mean_token_accuracy": 0.6017605066299438, "num_tokens": 23182354.0, "step": 347 }, { "epoch": 1.408906882591093, "grad_norm": 7.225053680436044, "learning_rate": 5.951417004048583e-06, "loss": 13.2812, "mean_token_accuracy": 0.6096216440200806, "num_tokens": 23244819.0, "step": 348 }, { "epoch": 1.4129554655870447, "grad_norm": 6.910314738528923, "learning_rate": 5.91093117408907e-06, "loss": 12.5781, "mean_token_accuracy": 0.6233025193214417, "num_tokens": 23301143.0, "step": 349 }, { "epoch": 1.417004048582996, "grad_norm": 13.172174214194689, "learning_rate": 5.870445344129555e-06, "loss": 13.7188, "mean_token_accuracy": 0.5989026427268982, "num_tokens": 23373743.0, "step": 350 }, { "epoch": 1.417004048582996, "eval_loss": 1.683750033378601, "eval_mean_token_accuracy": 0.6025040000677109, "eval_num_tokens": 23373743.0, "eval_runtime": 0.638, "eval_samples_per_second": 313.493, "eval_steps_per_second": 6.27, "step": 350 }, { "epoch": 1.4210526315789473, "grad_norm": 7.059247196559215, "learning_rate": 5.8299595141700406e-06, "loss": 12.2812, "mean_token_accuracy": 0.6239601373672485, "num_tokens": 23441002.0, "step": 351 }, { "epoch": 1.425101214574899, "grad_norm": 6.949818046383843, "learning_rate": 5.789473684210527e-06, "loss": 12.3125, "mean_token_accuracy": 0.6313892006874084, "num_tokens": 23495161.0, "step": 352 }, { "epoch": 1.4291497975708503, "grad_norm": 6.059843894633797, "learning_rate": 5.748987854251013e-06, "loss": 13.2344, "mean_token_accuracy": 0.6091973781585693, "num_tokens": 23564636.0, "step": 353 }, { "epoch": 1.4331983805668016, "grad_norm": 6.849778295075088, "learning_rate": 5.708502024291498e-06, "loss": 14.1484, "mean_token_accuracy": 0.5898481011390686, "num_tokens": 23634618.0, "step": 354 }, { "epoch": 1.4372469635627532, "grad_norm": 7.196515102955155, "learning_rate": 5.668016194331984e-06, "loss": 13.4844, "mean_token_accuracy": 0.608712911605835, "num_tokens": 23697715.0, "step": 355 }, { "epoch": 1.4372469635627532, "eval_loss": 1.683437466621399, "eval_mean_token_accuracy": 0.6026097536087036, "eval_num_tokens": 23697715.0, "eval_runtime": 0.6368, "eval_samples_per_second": 314.046, "eval_steps_per_second": 6.281, "step": 355 }, { "epoch": 1.4412955465587045, "grad_norm": 6.083851158318087, "learning_rate": 5.6275303643724695e-06, "loss": 13.1719, "mean_token_accuracy": 0.6111401319503784, "num_tokens": 23762805.0, "step": 356 }, { "epoch": 1.4453441295546559, "grad_norm": 6.3854204394565865, "learning_rate": 5.5870445344129565e-06, "loss": 14.1484, "mean_token_accuracy": 0.5870954990386963, "num_tokens": 23824801.0, "step": 357 }, { "epoch": 1.4493927125506074, "grad_norm": 6.205029195096519, "learning_rate": 5.546558704453442e-06, "loss": 12.5078, "mean_token_accuracy": 0.6245027184486389, "num_tokens": 23893486.0, "step": 358 }, { "epoch": 1.4534412955465588, "grad_norm": 6.498352605010108, "learning_rate": 5.506072874493927e-06, "loss": 13.1406, "mean_token_accuracy": 0.6103119254112244, "num_tokens": 23956739.0, "step": 359 }, { "epoch": 1.45748987854251, "grad_norm": 7.863658695213157, "learning_rate": 5.465587044534414e-06, "loss": 12.2969, "mean_token_accuracy": 0.6250467896461487, "num_tokens": 24015613.0, "step": 360 }, { "epoch": 1.45748987854251, "eval_loss": 1.6828124523162842, "eval_mean_token_accuracy": 0.6025930494070053, "eval_num_tokens": 24015613.0, "eval_runtime": 0.6414, "eval_samples_per_second": 311.802, "eval_steps_per_second": 6.236, "step": 360 }, { "epoch": 1.4615384615384617, "grad_norm": 6.004778979711357, "learning_rate": 5.425101214574899e-06, "loss": 13.375, "mean_token_accuracy": 0.6013794541358948, "num_tokens": 24093533.0, "step": 361 }, { "epoch": 1.465587044534413, "grad_norm": 6.524314866552431, "learning_rate": 5.384615384615385e-06, "loss": 13.1094, "mean_token_accuracy": 0.6128621101379395, "num_tokens": 24172029.0, "step": 362 }, { "epoch": 1.4696356275303644, "grad_norm": 5.757817859771559, "learning_rate": 5.344129554655872e-06, "loss": 12.7188, "mean_token_accuracy": 0.6142829060554504, "num_tokens": 24238844.0, "step": 363 }, { "epoch": 1.4736842105263157, "grad_norm": 5.843888667434192, "learning_rate": 5.303643724696357e-06, "loss": 12.4688, "mean_token_accuracy": 0.6290068030357361, "num_tokens": 24308602.0, "step": 364 }, { "epoch": 1.4777327935222673, "grad_norm": 7.095669449455518, "learning_rate": 5.263157894736842e-06, "loss": 13.2891, "mean_token_accuracy": 0.6107263565063477, "num_tokens": 24381011.0, "step": 365 }, { "epoch": 1.4777327935222673, "eval_loss": 1.6828124523162842, "eval_mean_token_accuracy": 0.6028460413217545, "eval_num_tokens": 24381011.0, "eval_runtime": 0.6426, "eval_samples_per_second": 311.246, "eval_steps_per_second": 6.225, "step": 365 }, { "epoch": 1.4817813765182186, "grad_norm": 7.089500617007727, "learning_rate": 5.222672064777329e-06, "loss": 13.2422, "mean_token_accuracy": 0.6082656383514404, "num_tokens": 24447301.0, "step": 366 }, { "epoch": 1.48582995951417, "grad_norm": 5.759008858515851, "learning_rate": 5.1821862348178145e-06, "loss": 11.7812, "mean_token_accuracy": 0.6373518109321594, "num_tokens": 24521514.0, "step": 367 }, { "epoch": 1.4898785425101215, "grad_norm": 7.257765350400875, "learning_rate": 5.1417004048583e-06, "loss": 13.4219, "mean_token_accuracy": 0.6083607077598572, "num_tokens": 24587266.0, "step": 368 }, { "epoch": 1.4939271255060729, "grad_norm": 6.617774961612806, "learning_rate": 5.101214574898786e-06, "loss": 11.9922, "mean_token_accuracy": 0.6397205591201782, "num_tokens": 24647883.0, "step": 369 }, { "epoch": 1.4979757085020242, "grad_norm": 6.034070184034884, "learning_rate": 5.060728744939272e-06, "loss": 14.0781, "mean_token_accuracy": 0.5942122936248779, "num_tokens": 24723728.0, "step": 370 }, { "epoch": 1.4979757085020242, "eval_loss": 1.682187557220459, "eval_mean_token_accuracy": 0.6028562039136887, "eval_num_tokens": 24723728.0, "eval_runtime": 0.6595, "eval_samples_per_second": 303.264, "eval_steps_per_second": 6.065, "step": 370 }, { "epoch": 1.5020242914979756, "grad_norm": 7.413748338165612, "learning_rate": 5.020242914979757e-06, "loss": 13.0234, "mean_token_accuracy": 0.6105521321296692, "num_tokens": 24786206.0, "step": 371 }, { "epoch": 1.5060728744939271, "grad_norm": 7.4171774757668745, "learning_rate": 4.9797570850202435e-06, "loss": 13.1641, "mean_token_accuracy": 0.6107207536697388, "num_tokens": 24851452.0, "step": 372 }, { "epoch": 1.5101214574898787, "grad_norm": 6.678125850261295, "learning_rate": 4.939271255060729e-06, "loss": 12.1406, "mean_token_accuracy": 0.6283853650093079, "num_tokens": 24912625.0, "step": 373 }, { "epoch": 1.5141700404858298, "grad_norm": 6.7202926787891375, "learning_rate": 4.898785425101215e-06, "loss": 13.0469, "mean_token_accuracy": 0.6010765433311462, "num_tokens": 24976596.0, "step": 374 }, { "epoch": 1.5182186234817814, "grad_norm": 7.189454405906928, "learning_rate": 4.8582995951417e-06, "loss": 11.9141, "mean_token_accuracy": 0.6381216049194336, "num_tokens": 25030493.0, "step": 375 }, { "epoch": 1.5182186234817814, "eval_loss": 1.6809375286102295, "eval_mean_token_accuracy": 0.6026807576417923, "eval_num_tokens": 25030493.0, "eval_runtime": 0.6393, "eval_samples_per_second": 312.855, "eval_steps_per_second": 6.257, "step": 375 }, { "epoch": 1.522267206477733, "grad_norm": 6.309792255182094, "learning_rate": 4.817813765182186e-06, "loss": 12.0938, "mean_token_accuracy": 0.6288850903511047, "num_tokens": 25094133.0, "step": 376 }, { "epoch": 1.526315789473684, "grad_norm": 7.06660860300255, "learning_rate": 4.7773279352226725e-06, "loss": 12.7812, "mean_token_accuracy": 0.6158961057662964, "num_tokens": 25155860.0, "step": 377 }, { "epoch": 1.5303643724696356, "grad_norm": 8.076900211471674, "learning_rate": 4.736842105263158e-06, "loss": 12.5859, "mean_token_accuracy": 0.6151171922683716, "num_tokens": 25211146.0, "step": 378 }, { "epoch": 1.5344129554655872, "grad_norm": 6.221731683405277, "learning_rate": 4.696356275303644e-06, "loss": 12.4609, "mean_token_accuracy": 0.6272507905960083, "num_tokens": 25280353.0, "step": 379 }, { "epoch": 1.5384615384615383, "grad_norm": 7.4685425851793665, "learning_rate": 4.65587044534413e-06, "loss": 13.1562, "mean_token_accuracy": 0.6171966195106506, "num_tokens": 25335393.0, "step": 380 }, { "epoch": 1.5384615384615383, "eval_loss": 1.6803125143051147, "eval_mean_token_accuracy": 0.6025642305612564, "eval_num_tokens": 25335393.0, "eval_runtime": 0.6333, "eval_samples_per_second": 315.815, "eval_steps_per_second": 6.316, "step": 380 }, { "epoch": 1.54251012145749, "grad_norm": 6.476918972266226, "learning_rate": 4.615384615384616e-06, "loss": 12.8594, "mean_token_accuracy": 0.6220955848693848, "num_tokens": 25413527.0, "step": 381 }, { "epoch": 1.5465587044534415, "grad_norm": 6.648186663706087, "learning_rate": 4.5748987854251014e-06, "loss": 12.5703, "mean_token_accuracy": 0.6231284141540527, "num_tokens": 25480245.0, "step": 382 }, { "epoch": 1.5506072874493926, "grad_norm": 7.367613086661741, "learning_rate": 4.534412955465588e-06, "loss": 13.6406, "mean_token_accuracy": 0.6029731631278992, "num_tokens": 25547914.0, "step": 383 }, { "epoch": 1.5546558704453441, "grad_norm": 5.503766366101462, "learning_rate": 4.493927125506074e-06, "loss": 13.8125, "mean_token_accuracy": 0.5945603847503662, "num_tokens": 25631660.0, "step": 384 }, { "epoch": 1.5587044534412957, "grad_norm": 6.8977689320409885, "learning_rate": 4.453441295546559e-06, "loss": 13.5156, "mean_token_accuracy": 0.6095985174179077, "num_tokens": 25696588.0, "step": 385 }, { "epoch": 1.5587044534412957, "eval_loss": 1.6799999475479126, "eval_mean_token_accuracy": 0.6023529767990112, "eval_num_tokens": 25696588.0, "eval_runtime": 0.636, "eval_samples_per_second": 314.454, "eval_steps_per_second": 6.289, "step": 385 }, { "epoch": 1.5627530364372468, "grad_norm": 6.227043197214259, "learning_rate": 4.412955465587045e-06, "loss": 13.2656, "mean_token_accuracy": 0.6109288930892944, "num_tokens": 25764637.0, "step": 386 }, { "epoch": 1.5668016194331984, "grad_norm": 6.437168132260638, "learning_rate": 4.372469635627531e-06, "loss": 13.0156, "mean_token_accuracy": 0.6073063611984253, "num_tokens": 25828262.0, "step": 387 }, { "epoch": 1.5708502024291497, "grad_norm": 6.443176214251158, "learning_rate": 4.3319838056680166e-06, "loss": 12.5391, "mean_token_accuracy": 0.6221445798873901, "num_tokens": 25894997.0, "step": 388 }, { "epoch": 1.574898785425101, "grad_norm": 6.541270347260077, "learning_rate": 4.291497975708503e-06, "loss": 12.5625, "mean_token_accuracy": 0.6267231106758118, "num_tokens": 25956940.0, "step": 389 }, { "epoch": 1.5789473684210527, "grad_norm": 6.622129593963462, "learning_rate": 4.251012145748988e-06, "loss": 12.5, "mean_token_accuracy": 0.620368242263794, "num_tokens": 26022017.0, "step": 390 }, { "epoch": 1.5789473684210527, "eval_loss": 1.6799999475479126, "eval_mean_token_accuracy": 0.6024383455514908, "eval_num_tokens": 26022017.0, "eval_runtime": 0.6355, "eval_samples_per_second": 314.693, "eval_steps_per_second": 6.294, "step": 390 }, { "epoch": 1.582995951417004, "grad_norm": 5.683943779857191, "learning_rate": 4.210526315789474e-06, "loss": 12.7578, "mean_token_accuracy": 0.6183803677558899, "num_tokens": 26089381.0, "step": 391 }, { "epoch": 1.5870445344129553, "grad_norm": 6.4439629905284, "learning_rate": 4.170040485829959e-06, "loss": 13.5859, "mean_token_accuracy": 0.6022599339485168, "num_tokens": 26160421.0, "step": 392 }, { "epoch": 1.591093117408907, "grad_norm": 6.482307740702656, "learning_rate": 4.1295546558704455e-06, "loss": 13.0938, "mean_token_accuracy": 0.6097571849822998, "num_tokens": 26227164.0, "step": 393 }, { "epoch": 1.5951417004048583, "grad_norm": 5.6243349055097775, "learning_rate": 4.089068825910931e-06, "loss": 12.7969, "mean_token_accuracy": 0.6154806613922119, "num_tokens": 26297716.0, "step": 394 }, { "epoch": 1.5991902834008096, "grad_norm": 7.1839751833286725, "learning_rate": 4.048582995951417e-06, "loss": 12.5312, "mean_token_accuracy": 0.627181351184845, "num_tokens": 26358579.0, "step": 395 }, { "epoch": 1.5991902834008096, "eval_loss": 1.6775000095367432, "eval_mean_token_accuracy": 0.6028738915920258, "eval_num_tokens": 26358579.0, "eval_runtime": 0.6349, "eval_samples_per_second": 314.994, "eval_steps_per_second": 6.3, "step": 395 }, { "epoch": 1.6032388663967612, "grad_norm": 6.286540348376537, "learning_rate": 4.008097165991903e-06, "loss": 12.8438, "mean_token_accuracy": 0.618080198764801, "num_tokens": 26423963.0, "step": 396 }, { "epoch": 1.6072874493927125, "grad_norm": 8.094171440991735, "learning_rate": 3.967611336032389e-06, "loss": 12.6172, "mean_token_accuracy": 0.6203653216362, "num_tokens": 26473680.0, "step": 397 }, { "epoch": 1.6113360323886639, "grad_norm": 6.734637207480726, "learning_rate": 3.9271255060728745e-06, "loss": 13.1016, "mean_token_accuracy": 0.6167324185371399, "num_tokens": 26531787.0, "step": 398 }, { "epoch": 1.6153846153846154, "grad_norm": 6.248322358568084, "learning_rate": 3.886639676113361e-06, "loss": 12.0234, "mean_token_accuracy": 0.6334991455078125, "num_tokens": 26589739.0, "step": 399 }, { "epoch": 1.6194331983805668, "grad_norm": 7.003524821469698, "learning_rate": 3.846153846153847e-06, "loss": 12.5938, "mean_token_accuracy": 0.6244420409202576, "num_tokens": 26650518.0, "step": 400 }, { "epoch": 1.6194331983805668, "eval_loss": 1.6775000095367432, "eval_mean_token_accuracy": 0.6030516028404236, "eval_num_tokens": 26650518.0, "eval_runtime": 0.6363, "eval_samples_per_second": 314.294, "eval_steps_per_second": 6.286, "step": 400 }, { "epoch": 1.623481781376518, "grad_norm": 6.575500256754098, "learning_rate": 3.805668016194332e-06, "loss": 14.2031, "mean_token_accuracy": 0.5973594784736633, "num_tokens": 26721628.0, "step": 401 }, { "epoch": 1.6275303643724697, "grad_norm": 6.399615806731187, "learning_rate": 3.7651821862348182e-06, "loss": 13.0859, "mean_token_accuracy": 0.6143900752067566, "num_tokens": 26795479.0, "step": 402 }, { "epoch": 1.631578947368421, "grad_norm": 6.108759765220742, "learning_rate": 3.724696356275304e-06, "loss": 12.5, "mean_token_accuracy": 0.6269562244415283, "num_tokens": 26858994.0, "step": 403 }, { "epoch": 1.6356275303643724, "grad_norm": 6.378724273369664, "learning_rate": 3.6842105263157896e-06, "loss": 12.7422, "mean_token_accuracy": 0.6249410510063171, "num_tokens": 26924821.0, "step": 404 }, { "epoch": 1.639676113360324, "grad_norm": 6.284255806434736, "learning_rate": 3.6437246963562758e-06, "loss": 12.1797, "mean_token_accuracy": 0.6245778203010559, "num_tokens": 26981139.0, "step": 405 }, { "epoch": 1.639676113360324, "eval_loss": 1.6775000095367432, "eval_mean_token_accuracy": 0.6031172126531601, "eval_num_tokens": 26981139.0, "eval_runtime": 0.6612, "eval_samples_per_second": 302.482, "eval_steps_per_second": 6.05, "step": 405 }, { "epoch": 1.6437246963562753, "grad_norm": 6.284046381701035, "learning_rate": 3.6032388663967615e-06, "loss": 12.8438, "mean_token_accuracy": 0.6137465238571167, "num_tokens": 27036897.0, "step": 406 }, { "epoch": 1.6477732793522266, "grad_norm": 6.192144168674015, "learning_rate": 3.562753036437247e-06, "loss": 13.4453, "mean_token_accuracy": 0.6057220697402954, "num_tokens": 27113856.0, "step": 407 }, { "epoch": 1.6518218623481782, "grad_norm": 7.747379888410599, "learning_rate": 3.522267206477733e-06, "loss": 13.2812, "mean_token_accuracy": 0.6113630533218384, "num_tokens": 27187122.0, "step": 408 }, { "epoch": 1.6558704453441295, "grad_norm": 7.619435387018495, "learning_rate": 3.481781376518219e-06, "loss": 12.625, "mean_token_accuracy": 0.6203478574752808, "num_tokens": 27245197.0, "step": 409 }, { "epoch": 1.6599190283400809, "grad_norm": 6.774444594833395, "learning_rate": 3.4412955465587043e-06, "loss": 13.4766, "mean_token_accuracy": 0.6078824996948242, "num_tokens": 27310976.0, "step": 410 }, { "epoch": 1.6599190283400809, "eval_loss": 1.6768749952316284, "eval_mean_token_accuracy": 0.6029177159070969, "eval_num_tokens": 27310976.0, "eval_runtime": 0.652, "eval_samples_per_second": 306.731, "eval_steps_per_second": 6.135, "step": 410 }, { "epoch": 1.6639676113360324, "grad_norm": 6.546428537195409, "learning_rate": 3.4008097165991905e-06, "loss": 13.0, "mean_token_accuracy": 0.6162744760513306, "num_tokens": 27381838.0, "step": 411 }, { "epoch": 1.6680161943319838, "grad_norm": 5.661705927031424, "learning_rate": 3.3603238866396766e-06, "loss": 12.9844, "mean_token_accuracy": 0.6101025938987732, "num_tokens": 27457427.0, "step": 412 }, { "epoch": 1.6720647773279351, "grad_norm": 6.634404347857771, "learning_rate": 3.3198380566801623e-06, "loss": 13.2344, "mean_token_accuracy": 0.6103484034538269, "num_tokens": 27525598.0, "step": 413 }, { "epoch": 1.6761133603238867, "grad_norm": 6.4588054803695165, "learning_rate": 3.279352226720648e-06, "loss": 13.0859, "mean_token_accuracy": 0.6082247495651245, "num_tokens": 27586138.0, "step": 414 }, { "epoch": 1.680161943319838, "grad_norm": 6.229468320759774, "learning_rate": 3.2388663967611337e-06, "loss": 13.2578, "mean_token_accuracy": 0.6039714813232422, "num_tokens": 27656654.0, "step": 415 }, { "epoch": 1.680161943319838, "eval_loss": 1.6768749952316284, "eval_mean_token_accuracy": 0.603041797876358, "eval_num_tokens": 27656654.0, "eval_runtime": 0.6416, "eval_samples_per_second": 311.713, "eval_steps_per_second": 6.234, "step": 415 }, { "epoch": 1.6842105263157894, "grad_norm": 6.530706970746754, "learning_rate": 3.19838056680162e-06, "loss": 12.9609, "mean_token_accuracy": 0.616111159324646, "num_tokens": 27719395.0, "step": 416 }, { "epoch": 1.688259109311741, "grad_norm": 5.974041721296059, "learning_rate": 3.157894736842105e-06, "loss": 13.0781, "mean_token_accuracy": 0.6097447872161865, "num_tokens": 27805700.0, "step": 417 }, { "epoch": 1.6923076923076923, "grad_norm": 6.9520061051049655, "learning_rate": 3.1174089068825913e-06, "loss": 13.1328, "mean_token_accuracy": 0.6185556054115295, "num_tokens": 27870607.0, "step": 418 }, { "epoch": 1.6963562753036436, "grad_norm": 6.687485877625691, "learning_rate": 3.0769230769230774e-06, "loss": 13.125, "mean_token_accuracy": 0.6092858910560608, "num_tokens": 27935199.0, "step": 419 }, { "epoch": 1.7004048582995952, "grad_norm": 5.89529220246302, "learning_rate": 3.0364372469635627e-06, "loss": 12.7344, "mean_token_accuracy": 0.6122572422027588, "num_tokens": 27999633.0, "step": 420 }, { "epoch": 1.7004048582995952, "eval_loss": 1.6762499809265137, "eval_mean_token_accuracy": 0.6032748818397522, "eval_num_tokens": 27999633.0, "eval_runtime": 0.6365, "eval_samples_per_second": 314.198, "eval_steps_per_second": 6.284, "step": 420 }, { "epoch": 1.7044534412955465, "grad_norm": 5.97670859064412, "learning_rate": 2.995951417004049e-06, "loss": 13.6172, "mean_token_accuracy": 0.6090903878211975, "num_tokens": 28070629.0, "step": 421 }, { "epoch": 1.708502024291498, "grad_norm": 6.35673530964282, "learning_rate": 2.955465587044535e-06, "loss": 14.1406, "mean_token_accuracy": 0.5912197828292847, "num_tokens": 28143265.0, "step": 422 }, { "epoch": 1.7125506072874495, "grad_norm": 6.087049071022692, "learning_rate": 2.9149797570850203e-06, "loss": 12.7266, "mean_token_accuracy": 0.6160033345222473, "num_tokens": 28217426.0, "step": 423 }, { "epoch": 1.7165991902834008, "grad_norm": 6.5904480079951915, "learning_rate": 2.8744939271255064e-06, "loss": 12.9141, "mean_token_accuracy": 0.6187474727630615, "num_tokens": 28286022.0, "step": 424 }, { "epoch": 1.7206477732793521, "grad_norm": 6.195739080649372, "learning_rate": 2.834008097165992e-06, "loss": 13.5859, "mean_token_accuracy": 0.6009435057640076, "num_tokens": 28356036.0, "step": 425 }, { "epoch": 1.7206477732793521, "eval_loss": 1.6762499809265137, "eval_mean_token_accuracy": 0.6033691614866257, "eval_num_tokens": 28356036.0, "eval_runtime": 0.641, "eval_samples_per_second": 312.0, "eval_steps_per_second": 6.24, "step": 425 }, { "epoch": 1.7246963562753037, "grad_norm": 6.307993289907047, "learning_rate": 2.7935222672064783e-06, "loss": 13.1875, "mean_token_accuracy": 0.6147733926773071, "num_tokens": 28430355.0, "step": 426 }, { "epoch": 1.728744939271255, "grad_norm": 6.289447300448821, "learning_rate": 2.7530364372469636e-06, "loss": 12.5156, "mean_token_accuracy": 0.6260796785354614, "num_tokens": 28494906.0, "step": 427 }, { "epoch": 1.7327935222672064, "grad_norm": 6.826350614499696, "learning_rate": 2.7125506072874497e-06, "loss": 12.6094, "mean_token_accuracy": 0.6202298998832703, "num_tokens": 28562395.0, "step": 428 }, { "epoch": 1.736842105263158, "grad_norm": 6.595176524843769, "learning_rate": 2.672064777327936e-06, "loss": 12.8047, "mean_token_accuracy": 0.6235784292221069, "num_tokens": 28624889.0, "step": 429 }, { "epoch": 1.7408906882591093, "grad_norm": 5.691303853471213, "learning_rate": 2.631578947368421e-06, "loss": 13.2656, "mean_token_accuracy": 0.6069343686103821, "num_tokens": 28698788.0, "step": 430 }, { "epoch": 1.7408906882591093, "eval_loss": 1.675624966621399, "eval_mean_token_accuracy": 0.603339210152626, "eval_num_tokens": 28698788.0, "eval_runtime": 0.6338, "eval_samples_per_second": 315.581, "eval_steps_per_second": 6.312, "step": 430 }, { "epoch": 1.7449392712550607, "grad_norm": 5.997572345174048, "learning_rate": 2.5910931174089072e-06, "loss": 12.9844, "mean_token_accuracy": 0.6104946732521057, "num_tokens": 28773747.0, "step": 431 }, { "epoch": 1.7489878542510122, "grad_norm": 7.249965290425393, "learning_rate": 2.550607287449393e-06, "loss": 13.4297, "mean_token_accuracy": 0.6104491353034973, "num_tokens": 28842544.0, "step": 432 }, { "epoch": 1.7530364372469636, "grad_norm": 6.000414268983646, "learning_rate": 2.5101214574898787e-06, "loss": 12.5781, "mean_token_accuracy": 0.6204512119293213, "num_tokens": 28912462.0, "step": 433 }, { "epoch": 1.757085020242915, "grad_norm": 6.118904164123435, "learning_rate": 2.4696356275303644e-06, "loss": 12.7109, "mean_token_accuracy": 0.6210896968841553, "num_tokens": 28992347.0, "step": 434 }, { "epoch": 1.7611336032388665, "grad_norm": 6.1441994160988065, "learning_rate": 2.42914979757085e-06, "loss": 11.4531, "mean_token_accuracy": 0.6428948044776917, "num_tokens": 29066420.0, "step": 435 }, { "epoch": 1.7611336032388665, "eval_loss": 1.6749999523162842, "eval_mean_token_accuracy": 0.6034704595804214, "eval_num_tokens": 29066420.0, "eval_runtime": 0.6572, "eval_samples_per_second": 304.314, "eval_steps_per_second": 6.086, "step": 435 }, { "epoch": 1.7651821862348178, "grad_norm": 8.29981852170875, "learning_rate": 2.3886639676113362e-06, "loss": 12.7266, "mean_token_accuracy": 0.6216300129890442, "num_tokens": 29140409.0, "step": 436 }, { "epoch": 1.7692307692307692, "grad_norm": 7.051595151204688, "learning_rate": 2.348178137651822e-06, "loss": 12.9375, "mean_token_accuracy": 0.6193385124206543, "num_tokens": 29200219.0, "step": 437 }, { "epoch": 1.7732793522267207, "grad_norm": 6.304795255922526, "learning_rate": 2.307692307692308e-06, "loss": 12.4688, "mean_token_accuracy": 0.6236432194709778, "num_tokens": 29271314.0, "step": 438 }, { "epoch": 1.777327935222672, "grad_norm": 5.556856434206601, "learning_rate": 2.267206477732794e-06, "loss": 12.4609, "mean_token_accuracy": 0.6236319541931152, "num_tokens": 29342739.0, "step": 439 }, { "epoch": 1.7813765182186234, "grad_norm": 7.6181275587053445, "learning_rate": 2.2267206477732795e-06, "loss": 12.3359, "mean_token_accuracy": 0.6281608939170837, "num_tokens": 29399868.0, "step": 440 }, { "epoch": 1.7813765182186234, "eval_loss": 1.6746875047683716, "eval_mean_token_accuracy": 0.6035052537918091, "eval_num_tokens": 29399868.0, "eval_runtime": 0.6412, "eval_samples_per_second": 311.93, "eval_steps_per_second": 6.239, "step": 440 }, { "epoch": 1.785425101214575, "grad_norm": 7.163759421924407, "learning_rate": 2.1862348178137656e-06, "loss": 13.4844, "mean_token_accuracy": 0.6067489385604858, "num_tokens": 29471825.0, "step": 441 }, { "epoch": 1.7894736842105263, "grad_norm": 5.980379834577915, "learning_rate": 2.1457489878542513e-06, "loss": 12.6953, "mean_token_accuracy": 0.6232491135597229, "num_tokens": 29543780.0, "step": 442 }, { "epoch": 1.7935222672064777, "grad_norm": 5.658389056927855, "learning_rate": 2.105263157894737e-06, "loss": 12.9844, "mean_token_accuracy": 0.6110087633132935, "num_tokens": 29620910.0, "step": 443 }, { "epoch": 1.7975708502024292, "grad_norm": 5.632632433555104, "learning_rate": 2.0647773279352228e-06, "loss": 12.3203, "mean_token_accuracy": 0.6279122829437256, "num_tokens": 29695744.0, "step": 444 }, { "epoch": 1.8016194331983806, "grad_norm": 5.67469055605223, "learning_rate": 2.0242914979757085e-06, "loss": 12.5547, "mean_token_accuracy": 0.6283406615257263, "num_tokens": 29767874.0, "step": 445 }, { "epoch": 1.8016194331983806, "eval_loss": 1.6746875047683716, "eval_mean_token_accuracy": 0.6034050285816193, "eval_num_tokens": 29767874.0, "eval_runtime": 0.6422, "eval_samples_per_second": 311.443, "eval_steps_per_second": 6.229, "step": 445 }, { "epoch": 1.805668016194332, "grad_norm": 6.569962521896024, "learning_rate": 1.9838056680161946e-06, "loss": 11.7422, "mean_token_accuracy": 0.6383576393127441, "num_tokens": 29831675.0, "step": 446 }, { "epoch": 1.8097165991902835, "grad_norm": 6.86808486740946, "learning_rate": 1.9433198380566803e-06, "loss": 13.7891, "mean_token_accuracy": 0.5985156893730164, "num_tokens": 29900728.0, "step": 447 }, { "epoch": 1.8137651821862348, "grad_norm": 5.57356193397258, "learning_rate": 1.902834008097166e-06, "loss": 13.1406, "mean_token_accuracy": 0.6123852133750916, "num_tokens": 29970601.0, "step": 448 }, { "epoch": 1.8178137651821862, "grad_norm": 6.094864274620779, "learning_rate": 1.862348178137652e-06, "loss": 13.1562, "mean_token_accuracy": 0.613720178604126, "num_tokens": 30034673.0, "step": 449 }, { "epoch": 1.8218623481781377, "grad_norm": 7.163324826123694, "learning_rate": 1.8218623481781379e-06, "loss": 12.1641, "mean_token_accuracy": 0.6280580759048462, "num_tokens": 30091840.0, "step": 450 }, { "epoch": 1.8218623481781377, "eval_loss": 1.6737500429153442, "eval_mean_token_accuracy": 0.6033113747835159, "eval_num_tokens": 30091840.0, "eval_runtime": 0.6388, "eval_samples_per_second": 313.097, "eval_steps_per_second": 6.262, "step": 450 }, { "epoch": 1.825910931174089, "grad_norm": 5.721465202296356, "learning_rate": 1.7813765182186236e-06, "loss": 13.1172, "mean_token_accuracy": 0.6090490818023682, "num_tokens": 30160795.0, "step": 451 }, { "epoch": 1.8299595141700404, "grad_norm": 5.834496019461034, "learning_rate": 1.7408906882591095e-06, "loss": 11.6094, "mean_token_accuracy": 0.6369089484214783, "num_tokens": 30223207.0, "step": 452 }, { "epoch": 1.834008097165992, "grad_norm": 5.989944536955842, "learning_rate": 1.7004048582995952e-06, "loss": 12.8125, "mean_token_accuracy": 0.6168772578239441, "num_tokens": 30292998.0, "step": 453 }, { "epoch": 1.8380566801619433, "grad_norm": 6.175252808359519, "learning_rate": 1.6599190283400812e-06, "loss": 12.4141, "mean_token_accuracy": 0.6250400543212891, "num_tokens": 30358645.0, "step": 454 }, { "epoch": 1.8421052631578947, "grad_norm": 6.320123552822768, "learning_rate": 1.6194331983805669e-06, "loss": 12.7969, "mean_token_accuracy": 0.6187460422515869, "num_tokens": 30426126.0, "step": 455 }, { "epoch": 1.8421052631578947, "eval_loss": 1.6737500429153442, "eval_mean_token_accuracy": 0.6035889238119125, "eval_num_tokens": 30426126.0, "eval_runtime": 0.6462, "eval_samples_per_second": 309.508, "eval_steps_per_second": 6.19, "step": 455 }, { "epoch": 1.8461538461538463, "grad_norm": 7.054930335620368, "learning_rate": 1.5789473684210526e-06, "loss": 12.9453, "mean_token_accuracy": 0.6147574186325073, "num_tokens": 30490700.0, "step": 456 }, { "epoch": 1.8502024291497976, "grad_norm": 5.963169448840178, "learning_rate": 1.5384615384615387e-06, "loss": 12.7891, "mean_token_accuracy": 0.6221683025360107, "num_tokens": 30564085.0, "step": 457 }, { "epoch": 1.854251012145749, "grad_norm": 6.545239654505327, "learning_rate": 1.4979757085020244e-06, "loss": 13.8984, "mean_token_accuracy": 0.6013196110725403, "num_tokens": 30629773.0, "step": 458 }, { "epoch": 1.8582995951417005, "grad_norm": 7.19847490245891, "learning_rate": 1.4574898785425101e-06, "loss": 12.6953, "mean_token_accuracy": 0.6308658123016357, "num_tokens": 30685115.0, "step": 459 }, { "epoch": 1.8623481781376519, "grad_norm": 6.402339501112951, "learning_rate": 1.417004048582996e-06, "loss": 14.625, "mean_token_accuracy": 0.5789289474487305, "num_tokens": 30759499.0, "step": 460 }, { "epoch": 1.8623481781376519, "eval_loss": 1.6737500429153442, "eval_mean_token_accuracy": 0.6036617457866669, "eval_num_tokens": 30759499.0, "eval_runtime": 0.6471, "eval_samples_per_second": 309.048, "eval_steps_per_second": 6.181, "step": 460 }, { "epoch": 1.8663967611336032, "grad_norm": 6.554938664776423, "learning_rate": 1.3765182186234818e-06, "loss": 12.0312, "mean_token_accuracy": 0.6326593160629272, "num_tokens": 30821383.0, "step": 461 }, { "epoch": 1.8704453441295548, "grad_norm": 7.057225553782759, "learning_rate": 1.336032388663968e-06, "loss": 12.2266, "mean_token_accuracy": 0.6232849359512329, "num_tokens": 30881357.0, "step": 462 }, { "epoch": 1.874493927125506, "grad_norm": 5.899931807473347, "learning_rate": 1.2955465587044536e-06, "loss": 12.8906, "mean_token_accuracy": 0.6196330785751343, "num_tokens": 30944702.0, "step": 463 }, { "epoch": 1.8785425101214575, "grad_norm": 6.484707334873664, "learning_rate": 1.2550607287449393e-06, "loss": 12.6484, "mean_token_accuracy": 0.6225480437278748, "num_tokens": 31018431.0, "step": 464 }, { "epoch": 1.882591093117409, "grad_norm": 6.294801781620055, "learning_rate": 1.214574898785425e-06, "loss": 12.8203, "mean_token_accuracy": 0.6174430847167969, "num_tokens": 31081947.0, "step": 465 }, { "epoch": 1.882591093117409, "eval_loss": 1.6731250286102295, "eval_mean_token_accuracy": 0.6035991758108139, "eval_num_tokens": 31081947.0, "eval_runtime": 0.6369, "eval_samples_per_second": 314.028, "eval_steps_per_second": 6.281, "step": 465 }, { "epoch": 1.8866396761133604, "grad_norm": 7.0152791422709795, "learning_rate": 1.174089068825911e-06, "loss": 13.6797, "mean_token_accuracy": 0.5981454849243164, "num_tokens": 31147796.0, "step": 466 }, { "epoch": 1.8906882591093117, "grad_norm": 6.303822986663203, "learning_rate": 1.133603238866397e-06, "loss": 13.2266, "mean_token_accuracy": 0.6078663468360901, "num_tokens": 31212134.0, "step": 467 }, { "epoch": 1.8947368421052633, "grad_norm": 5.882937931956007, "learning_rate": 1.0931174089068828e-06, "loss": 13.8359, "mean_token_accuracy": 0.6062960624694824, "num_tokens": 31282591.0, "step": 468 }, { "epoch": 1.8987854251012146, "grad_norm": 5.937096953516108, "learning_rate": 1.0526315789473685e-06, "loss": 12.3516, "mean_token_accuracy": 0.6282551884651184, "num_tokens": 31354656.0, "step": 469 }, { "epoch": 1.902834008097166, "grad_norm": 6.3577908169539255, "learning_rate": 1.0121457489878542e-06, "loss": 12.9531, "mean_token_accuracy": 0.6160109043121338, "num_tokens": 31420675.0, "step": 470 }, { "epoch": 1.902834008097166, "eval_loss": 1.673437476158142, "eval_mean_token_accuracy": 0.6037902683019638, "eval_num_tokens": 31420675.0, "eval_runtime": 0.6715, "eval_samples_per_second": 297.827, "eval_steps_per_second": 5.957, "step": 470 }, { "epoch": 1.9068825910931175, "grad_norm": 6.62892566578751, "learning_rate": 9.716599190283402e-07, "loss": 13.2188, "mean_token_accuracy": 0.6141934990882874, "num_tokens": 31486290.0, "step": 471 }, { "epoch": 1.9109311740890689, "grad_norm": 6.1078260873922625, "learning_rate": 9.31174089068826e-07, "loss": 12.6641, "mean_token_accuracy": 0.6177834868431091, "num_tokens": 31557060.0, "step": 472 }, { "epoch": 1.9149797570850202, "grad_norm": 6.925934956753748, "learning_rate": 8.906882591093118e-07, "loss": 13.0938, "mean_token_accuracy": 0.613021969795227, "num_tokens": 31628035.0, "step": 473 }, { "epoch": 1.9190283400809718, "grad_norm": 5.645167673990774, "learning_rate": 8.502024291497976e-07, "loss": 13.0625, "mean_token_accuracy": 0.6135348677635193, "num_tokens": 31704450.0, "step": 474 }, { "epoch": 1.9230769230769231, "grad_norm": 6.3104167802865385, "learning_rate": 8.097165991902834e-07, "loss": 12.6016, "mean_token_accuracy": 0.6247060298919678, "num_tokens": 31771696.0, "step": 475 }, { "epoch": 1.9230769230769231, "eval_loss": 1.6740624904632568, "eval_mean_token_accuracy": 0.6038688272237778, "eval_num_tokens": 31771696.0, "eval_runtime": 0.6717, "eval_samples_per_second": 297.737, "eval_steps_per_second": 5.955, "step": 475 }, { "epoch": 1.9271255060728745, "grad_norm": 8.75546525050239, "learning_rate": 7.692307692307694e-07, "loss": 14.3125, "mean_token_accuracy": 0.6006154417991638, "num_tokens": 31833828.0, "step": 476 }, { "epoch": 1.931174089068826, "grad_norm": 6.568540834125108, "learning_rate": 7.287449392712551e-07, "loss": 12.4688, "mean_token_accuracy": 0.6260876655578613, "num_tokens": 31908019.0, "step": 477 }, { "epoch": 1.9352226720647774, "grad_norm": 5.992577753842725, "learning_rate": 6.882591093117409e-07, "loss": 11.6328, "mean_token_accuracy": 0.64389967918396, "num_tokens": 31968785.0, "step": 478 }, { "epoch": 1.9392712550607287, "grad_norm": 6.737090705162421, "learning_rate": 6.477732793522268e-07, "loss": 12.6562, "mean_token_accuracy": 0.629744291305542, "num_tokens": 32032478.0, "step": 479 }, { "epoch": 1.9433198380566803, "grad_norm": 6.273424964829595, "learning_rate": 6.072874493927125e-07, "loss": 12.1953, "mean_token_accuracy": 0.6316378116607666, "num_tokens": 32096023.0, "step": 480 }, { "epoch": 1.9433198380566803, "eval_loss": 1.6731250286102295, "eval_mean_token_accuracy": 0.6038562655448914, "eval_num_tokens": 32096023.0, "eval_runtime": 0.637, "eval_samples_per_second": 313.985, "eval_steps_per_second": 6.28, "step": 480 }, { "epoch": 1.9473684210526314, "grad_norm": 6.127164798465686, "learning_rate": 5.668016194331984e-07, "loss": 12.1719, "mean_token_accuracy": 0.6262784600257874, "num_tokens": 32164919.0, "step": 481 }, { "epoch": 1.951417004048583, "grad_norm": 5.756695804667141, "learning_rate": 5.263157894736843e-07, "loss": 12.6875, "mean_token_accuracy": 0.6200114488601685, "num_tokens": 32233114.0, "step": 482 }, { "epoch": 1.9554655870445345, "grad_norm": 6.564768636864393, "learning_rate": 4.858299595141701e-07, "loss": 13.1953, "mean_token_accuracy": 0.6086875796318054, "num_tokens": 32303117.0, "step": 483 }, { "epoch": 1.9595141700404857, "grad_norm": 7.252534772707513, "learning_rate": 4.453441295546559e-07, "loss": 13.25, "mean_token_accuracy": 0.6089531779289246, "num_tokens": 32373927.0, "step": 484 }, { "epoch": 1.9635627530364372, "grad_norm": 6.803603694268994, "learning_rate": 4.048582995951417e-07, "loss": 12.4609, "mean_token_accuracy": 0.624879777431488, "num_tokens": 32444699.0, "step": 485 }, { "epoch": 1.9635627530364372, "eval_loss": 1.6731250286102295, "eval_mean_token_accuracy": 0.6039727181196213, "eval_num_tokens": 32444699.0, "eval_runtime": 0.6475, "eval_samples_per_second": 308.869, "eval_steps_per_second": 6.177, "step": 485 }, { "epoch": 1.9676113360323888, "grad_norm": 6.049306687892726, "learning_rate": 3.6437246963562754e-07, "loss": 14.0625, "mean_token_accuracy": 0.593756914138794, "num_tokens": 32516907.0, "step": 486 }, { "epoch": 1.97165991902834, "grad_norm": 7.270949297430324, "learning_rate": 3.238866396761134e-07, "loss": 12.6406, "mean_token_accuracy": 0.6211447715759277, "num_tokens": 32583342.0, "step": 487 }, { "epoch": 1.9757085020242915, "grad_norm": 8.038406126817675, "learning_rate": 2.834008097165992e-07, "loss": 12.9766, "mean_token_accuracy": 0.6204248666763306, "num_tokens": 32639142.0, "step": 488 }, { "epoch": 1.979757085020243, "grad_norm": 6.211130232502858, "learning_rate": 2.4291497975708504e-07, "loss": 12.1953, "mean_token_accuracy": 0.6283571124076843, "num_tokens": 32705595.0, "step": 489 }, { "epoch": 1.9838056680161942, "grad_norm": 6.72573911112471, "learning_rate": 2.0242914979757086e-07, "loss": 13.8281, "mean_token_accuracy": 0.5990269184112549, "num_tokens": 32771222.0, "step": 490 }, { "epoch": 1.9838056680161942, "eval_loss": 1.6731250286102295, "eval_mean_token_accuracy": 0.6039326936006546, "eval_num_tokens": 32771222.0, "eval_runtime": 0.639, "eval_samples_per_second": 312.997, "eval_steps_per_second": 6.26, "step": 490 }, { "epoch": 1.9878542510121457, "grad_norm": 6.616901783015977, "learning_rate": 1.619433198380567e-07, "loss": 12.0078, "mean_token_accuracy": 0.6346266865730286, "num_tokens": 32827549.0, "step": 491 }, { "epoch": 1.9919028340080973, "grad_norm": 6.065995562899032, "learning_rate": 1.2145748987854252e-07, "loss": 13.1484, "mean_token_accuracy": 0.6112900972366333, "num_tokens": 32900562.0, "step": 492 }, { "epoch": 1.9959514170040484, "grad_norm": 6.877917660573328, "learning_rate": 8.097165991902835e-08, "loss": 13.1406, "mean_token_accuracy": 0.6131631135940552, "num_tokens": 32967282.0, "step": 493 }, { "epoch": 2.0, "grad_norm": 6.297817735745015, "learning_rate": 4.0485829959514176e-08, "loss": 12.4141, "mean_token_accuracy": 0.6238685250282288, "num_tokens": 33035067.0, "step": 494 }, { "epoch": 2.0, "step": 494, "total_flos": 39806562140160.0, "train_loss": 13.738502656882591, "train_runtime": 467.8481, "train_samples_per_second": 67.569, "train_steps_per_second": 1.056 } ], "logging_steps": 1.0, "max_steps": 494, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 39806562140160.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }