| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 5, | |
| "global_step": 494, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004048582995951417, | |
| "grad_norm": 57.285691125278106, | |
| "learning_rate": 2e-05, | |
| "loss": 20.1406, | |
| "mean_token_accuracy": 0.5111725330352783, | |
| "num_tokens": 72429.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008097165991902834, | |
| "grad_norm": 41.99363066045992, | |
| "learning_rate": 1.9959514170040488e-05, | |
| "loss": 18.6875, | |
| "mean_token_accuracy": 0.5240641832351685, | |
| "num_tokens": 136634.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.012145748987854251, | |
| "grad_norm": 38.5726807470561, | |
| "learning_rate": 1.9919028340080974e-05, | |
| "loss": 19.4375, | |
| "mean_token_accuracy": 0.5193748474121094, | |
| "num_tokens": 204776.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.016194331983805668, | |
| "grad_norm": 36.354880235444575, | |
| "learning_rate": 1.987854251012146e-05, | |
| "loss": 18.5859, | |
| "mean_token_accuracy": 0.5202323198318481, | |
| "num_tokens": 271293.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.020242914979757085, | |
| "grad_norm": 28.213868324309114, | |
| "learning_rate": 1.9838056680161946e-05, | |
| "loss": 17.4609, | |
| "mean_token_accuracy": 0.5324159860610962, | |
| "num_tokens": 332546.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.020242914979757085, | |
| "eval_loss": 2.1596875190734863, | |
| "eval_mean_token_accuracy": 0.5420514196157455, | |
| "eval_num_tokens": 332546.0, | |
| "eval_runtime": 3.902, | |
| "eval_samples_per_second": 51.256, | |
| "eval_steps_per_second": 1.025, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.024291497975708502, | |
| "grad_norm": 27.134077029097234, | |
| "learning_rate": 1.979757085020243e-05, | |
| "loss": 17.4922, | |
| "mean_token_accuracy": 0.5339857339859009, | |
| "num_tokens": 402301.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.02834008097165992, | |
| "grad_norm": 18.222480703668843, | |
| "learning_rate": 1.9757085020242915e-05, | |
| "loss": 17.1172, | |
| "mean_token_accuracy": 0.5464459657669067, | |
| "num_tokens": 461692.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.032388663967611336, | |
| "grad_norm": 15.082689481608242, | |
| "learning_rate": 1.9716599190283405e-05, | |
| "loss": 16.8516, | |
| "mean_token_accuracy": 0.5461285710334778, | |
| "num_tokens": 538227.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.03643724696356275, | |
| "grad_norm": 12.197205828835985, | |
| "learning_rate": 1.9676113360323887e-05, | |
| "loss": 16.1562, | |
| "mean_token_accuracy": 0.5573914647102356, | |
| "num_tokens": 603109.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.04048582995951417, | |
| "grad_norm": 13.717242431104397, | |
| "learning_rate": 1.9635627530364373e-05, | |
| "loss": 15.7266, | |
| "mean_token_accuracy": 0.5678688287734985, | |
| "num_tokens": 659598.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04048582995951417, | |
| "eval_loss": 2.0374999046325684, | |
| "eval_mean_token_accuracy": 0.5590415745973587, | |
| "eval_num_tokens": 659598.0, | |
| "eval_runtime": 0.6198, | |
| "eval_samples_per_second": 322.698, | |
| "eval_steps_per_second": 6.454, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.044534412955465584, | |
| "grad_norm": 12.306486360056866, | |
| "learning_rate": 1.959514170040486e-05, | |
| "loss": 15.6484, | |
| "mean_token_accuracy": 0.5803827047348022, | |
| "num_tokens": 714481.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.048582995951417005, | |
| "grad_norm": 11.708980436467652, | |
| "learning_rate": 1.9554655870445346e-05, | |
| "loss": 15.4062, | |
| "mean_token_accuracy": 0.578364908695221, | |
| "num_tokens": 778432.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.05263157894736842, | |
| "grad_norm": 11.35292985675568, | |
| "learning_rate": 1.9514170040485832e-05, | |
| "loss": 16.2422, | |
| "mean_token_accuracy": 0.5603257417678833, | |
| "num_tokens": 843576.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.05668016194331984, | |
| "grad_norm": 11.721306843681429, | |
| "learning_rate": 1.9473684210526318e-05, | |
| "loss": 15.4141, | |
| "mean_token_accuracy": 0.5766724944114685, | |
| "num_tokens": 914252.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06072874493927125, | |
| "grad_norm": 11.371453682604804, | |
| "learning_rate": 1.94331983805668e-05, | |
| "loss": 15.2969, | |
| "mean_token_accuracy": 0.5723595023155212, | |
| "num_tokens": 976160.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.06072874493927125, | |
| "eval_loss": 1.9924999475479126, | |
| "eval_mean_token_accuracy": 0.5654653459787369, | |
| "eval_num_tokens": 976160.0, | |
| "eval_runtime": 0.6214, | |
| "eval_samples_per_second": 321.855, | |
| "eval_steps_per_second": 6.437, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.06477732793522267, | |
| "grad_norm": 13.380354993808568, | |
| "learning_rate": 1.939271255060729e-05, | |
| "loss": 17.2188, | |
| "mean_token_accuracy": 0.5414544939994812, | |
| "num_tokens": 1046132.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.06882591093117409, | |
| "grad_norm": 12.756822173097444, | |
| "learning_rate": 1.9352226720647776e-05, | |
| "loss": 15.7188, | |
| "mean_token_accuracy": 0.5702171325683594, | |
| "num_tokens": 1111408.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0728744939271255, | |
| "grad_norm": 10.541413178483012, | |
| "learning_rate": 1.931174089068826e-05, | |
| "loss": 16.6406, | |
| "mean_token_accuracy": 0.5545904040336609, | |
| "num_tokens": 1181008.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.07692307692307693, | |
| "grad_norm": 9.058216089424022, | |
| "learning_rate": 1.9271255060728745e-05, | |
| "loss": 15.4922, | |
| "mean_token_accuracy": 0.5672646164894104, | |
| "num_tokens": 1242895.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.08097165991902834, | |
| "grad_norm": 8.224696140023758, | |
| "learning_rate": 1.923076923076923e-05, | |
| "loss": 14.5, | |
| "mean_token_accuracy": 0.5876371264457703, | |
| "num_tokens": 1321339.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08097165991902834, | |
| "eval_loss": 1.959375023841858, | |
| "eval_mean_token_accuracy": 0.5695992410182953, | |
| "eval_num_tokens": 1321339.0, | |
| "eval_runtime": 0.6175, | |
| "eval_samples_per_second": 323.911, | |
| "eval_steps_per_second": 6.478, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08502024291497975, | |
| "grad_norm": 9.194280544253612, | |
| "learning_rate": 1.9190283400809718e-05, | |
| "loss": 15.7969, | |
| "mean_token_accuracy": 0.5668182373046875, | |
| "num_tokens": 1388271.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.08906882591093117, | |
| "grad_norm": 9.173673841849936, | |
| "learning_rate": 1.9149797570850204e-05, | |
| "loss": 16.1875, | |
| "mean_token_accuracy": 0.5593817234039307, | |
| "num_tokens": 1456462.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.0931174089068826, | |
| "grad_norm": 9.571035474364693, | |
| "learning_rate": 1.910931174089069e-05, | |
| "loss": 15.8516, | |
| "mean_token_accuracy": 0.5684004426002502, | |
| "num_tokens": 1518799.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.09716599190283401, | |
| "grad_norm": 7.975715179302099, | |
| "learning_rate": 1.9068825910931176e-05, | |
| "loss": 14.9844, | |
| "mean_token_accuracy": 0.5802298188209534, | |
| "num_tokens": 1588301.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.10121457489878542, | |
| "grad_norm": 8.06780763084373, | |
| "learning_rate": 1.9028340080971662e-05, | |
| "loss": 15.9766, | |
| "mean_token_accuracy": 0.5611966848373413, | |
| "num_tokens": 1665044.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.10121457489878542, | |
| "eval_loss": 1.9378124475479126, | |
| "eval_mean_token_accuracy": 0.571557804942131, | |
| "eval_num_tokens": 1665044.0, | |
| "eval_runtime": 0.6132, | |
| "eval_samples_per_second": 326.145, | |
| "eval_steps_per_second": 6.523, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.10526315789473684, | |
| "grad_norm": 9.003512171360999, | |
| "learning_rate": 1.8987854251012148e-05, | |
| "loss": 15.9062, | |
| "mean_token_accuracy": 0.5626139640808105, | |
| "num_tokens": 1731459.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.10931174089068826, | |
| "grad_norm": 8.045437012882436, | |
| "learning_rate": 1.894736842105263e-05, | |
| "loss": 14.9844, | |
| "mean_token_accuracy": 0.5767053365707397, | |
| "num_tokens": 1794413.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.11336032388663968, | |
| "grad_norm": 7.791818905787103, | |
| "learning_rate": 1.8906882591093117e-05, | |
| "loss": 16.0703, | |
| "mean_token_accuracy": 0.5646764636039734, | |
| "num_tokens": 1864000.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.11740890688259109, | |
| "grad_norm": 7.899247736502825, | |
| "learning_rate": 1.8866396761133607e-05, | |
| "loss": 14.6562, | |
| "mean_token_accuracy": 0.5848944187164307, | |
| "num_tokens": 1926306.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.1214574898785425, | |
| "grad_norm": 9.046927442368185, | |
| "learning_rate": 1.882591093117409e-05, | |
| "loss": 15.0781, | |
| "mean_token_accuracy": 0.5783066749572754, | |
| "num_tokens": 2001619.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1214574898785425, | |
| "eval_loss": 1.9221874475479126, | |
| "eval_mean_token_accuracy": 0.5738949328660965, | |
| "eval_num_tokens": 2001619.0, | |
| "eval_runtime": 0.6128, | |
| "eval_samples_per_second": 326.396, | |
| "eval_steps_per_second": 6.528, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.12550607287449392, | |
| "grad_norm": 8.961465962055085, | |
| "learning_rate": 1.8785425101214576e-05, | |
| "loss": 16.1797, | |
| "mean_token_accuracy": 0.5587651133537292, | |
| "num_tokens": 2066109.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.12955465587044535, | |
| "grad_norm": 7.562379064010745, | |
| "learning_rate": 1.874493927125506e-05, | |
| "loss": 15.4922, | |
| "mean_token_accuracy": 0.5650998950004578, | |
| "num_tokens": 2142786.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.13360323886639677, | |
| "grad_norm": 8.562712496801407, | |
| "learning_rate": 1.8704453441295548e-05, | |
| "loss": 14.9219, | |
| "mean_token_accuracy": 0.5784920454025269, | |
| "num_tokens": 2199722.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.13765182186234817, | |
| "grad_norm": 9.988663916747383, | |
| "learning_rate": 1.8663967611336034e-05, | |
| "loss": 17.1641, | |
| "mean_token_accuracy": 0.5491723418235779, | |
| "num_tokens": 2275906.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.1417004048582996, | |
| "grad_norm": 7.709526019356417, | |
| "learning_rate": 1.862348178137652e-05, | |
| "loss": 15.7656, | |
| "mean_token_accuracy": 0.5621457099914551, | |
| "num_tokens": 2358888.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.1417004048582996, | |
| "eval_loss": 1.9071874618530273, | |
| "eval_mean_token_accuracy": 0.5757417529821396, | |
| "eval_num_tokens": 2358888.0, | |
| "eval_runtime": 0.6136, | |
| "eval_samples_per_second": 325.959, | |
| "eval_steps_per_second": 6.519, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.145748987854251, | |
| "grad_norm": 8.03798824892872, | |
| "learning_rate": 1.8582995951417006e-05, | |
| "loss": 15.8047, | |
| "mean_token_accuracy": 0.5641842484474182, | |
| "num_tokens": 2428938.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.14979757085020243, | |
| "grad_norm": 8.32083756121802, | |
| "learning_rate": 1.8542510121457492e-05, | |
| "loss": 15.2578, | |
| "mean_token_accuracy": 0.5743291974067688, | |
| "num_tokens": 2493963.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.15384615384615385, | |
| "grad_norm": 8.354287332937643, | |
| "learning_rate": 1.850202429149798e-05, | |
| "loss": 15.4141, | |
| "mean_token_accuracy": 0.5688024759292603, | |
| "num_tokens": 2559904.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.15789473684210525, | |
| "grad_norm": 8.579778574791483, | |
| "learning_rate": 1.8461538461538465e-05, | |
| "loss": 15.5078, | |
| "mean_token_accuracy": 0.5710554122924805, | |
| "num_tokens": 2614334.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.16194331983805668, | |
| "grad_norm": 8.994514132460578, | |
| "learning_rate": 1.8421052631578947e-05, | |
| "loss": 14.1875, | |
| "mean_token_accuracy": 0.5983923673629761, | |
| "num_tokens": 2677721.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.16194331983805668, | |
| "eval_loss": 1.894374966621399, | |
| "eval_mean_token_accuracy": 0.577251747250557, | |
| "eval_num_tokens": 2677721.0, | |
| "eval_runtime": 0.6381, | |
| "eval_samples_per_second": 313.434, | |
| "eval_steps_per_second": 6.269, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1659919028340081, | |
| "grad_norm": 9.563884911162102, | |
| "learning_rate": 1.8380566801619433e-05, | |
| "loss": 15.8906, | |
| "mean_token_accuracy": 0.5601885914802551, | |
| "num_tokens": 2736542.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1700404858299595, | |
| "grad_norm": 8.720473365423832, | |
| "learning_rate": 1.8340080971659923e-05, | |
| "loss": 14.7969, | |
| "mean_token_accuracy": 0.5805999636650085, | |
| "num_tokens": 2795142.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.17408906882591094, | |
| "grad_norm": 7.269013839644517, | |
| "learning_rate": 1.8299595141700406e-05, | |
| "loss": 14.7031, | |
| "mean_token_accuracy": 0.578359067440033, | |
| "num_tokens": 2863022.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.17813765182186234, | |
| "grad_norm": 7.272014286398225, | |
| "learning_rate": 1.8259109311740892e-05, | |
| "loss": 14.8516, | |
| "mean_token_accuracy": 0.5769524574279785, | |
| "num_tokens": 2931180.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.18218623481781376, | |
| "grad_norm": 8.851418056520739, | |
| "learning_rate": 1.8218623481781378e-05, | |
| "loss": 15.3594, | |
| "mean_token_accuracy": 0.5745086669921875, | |
| "num_tokens": 2995713.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.18218623481781376, | |
| "eval_loss": 1.885312557220459, | |
| "eval_mean_token_accuracy": 0.5781335979700089, | |
| "eval_num_tokens": 2995713.0, | |
| "eval_runtime": 0.6226, | |
| "eval_samples_per_second": 321.249, | |
| "eval_steps_per_second": 6.425, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.1862348178137652, | |
| "grad_norm": 8.587589121567573, | |
| "learning_rate": 1.8178137651821864e-05, | |
| "loss": 14.9531, | |
| "mean_token_accuracy": 0.5738093852996826, | |
| "num_tokens": 3066642.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.1902834008097166, | |
| "grad_norm": 7.890409770531286, | |
| "learning_rate": 1.813765182186235e-05, | |
| "loss": 15.1016, | |
| "mean_token_accuracy": 0.5790749192237854, | |
| "num_tokens": 3128603.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.19433198380566802, | |
| "grad_norm": 7.793678892191199, | |
| "learning_rate": 1.8097165991902836e-05, | |
| "loss": 14.8281, | |
| "mean_token_accuracy": 0.5814535021781921, | |
| "num_tokens": 3187449.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.19838056680161945, | |
| "grad_norm": 7.256161611883218, | |
| "learning_rate": 1.805668016194332e-05, | |
| "loss": 14.9062, | |
| "mean_token_accuracy": 0.5790348649024963, | |
| "num_tokens": 3262113.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.20242914979757085, | |
| "grad_norm": 7.3121519125177254, | |
| "learning_rate": 1.801619433198381e-05, | |
| "loss": 14.6484, | |
| "mean_token_accuracy": 0.5866994261741638, | |
| "num_tokens": 3332956.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.20242914979757085, | |
| "eval_loss": 1.8756250143051147, | |
| "eval_mean_token_accuracy": 0.5786750316619873, | |
| "eval_num_tokens": 3332956.0, | |
| "eval_runtime": 0.616, | |
| "eval_samples_per_second": 324.656, | |
| "eval_steps_per_second": 6.493, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.20647773279352227, | |
| "grad_norm": 7.4168976628601095, | |
| "learning_rate": 1.7975708502024295e-05, | |
| "loss": 15.9141, | |
| "mean_token_accuracy": 0.5570866465568542, | |
| "num_tokens": 3402686.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 8.132782257191536, | |
| "learning_rate": 1.7935222672064778e-05, | |
| "loss": 13.6328, | |
| "mean_token_accuracy": 0.6026627421379089, | |
| "num_tokens": 3463439.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.2145748987854251, | |
| "grad_norm": 8.722536735498021, | |
| "learning_rate": 1.7894736842105264e-05, | |
| "loss": 15.9922, | |
| "mean_token_accuracy": 0.5636070370674133, | |
| "num_tokens": 3534808.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.21862348178137653, | |
| "grad_norm": 9.5925704794369, | |
| "learning_rate": 1.785425101214575e-05, | |
| "loss": 16.5859, | |
| "mean_token_accuracy": 0.5475066900253296, | |
| "num_tokens": 3598221.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.22267206477732793, | |
| "grad_norm": 8.403059172595993, | |
| "learning_rate": 1.7813765182186236e-05, | |
| "loss": 15.0859, | |
| "mean_token_accuracy": 0.5766125321388245, | |
| "num_tokens": 3667399.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.22267206477732793, | |
| "eval_loss": 1.8668750524520874, | |
| "eval_mean_token_accuracy": 0.5796706676483154, | |
| "eval_num_tokens": 3667399.0, | |
| "eval_runtime": 0.6158, | |
| "eval_samples_per_second": 324.774, | |
| "eval_steps_per_second": 6.495, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.22672064777327935, | |
| "grad_norm": 7.854856510906883, | |
| "learning_rate": 1.7773279352226722e-05, | |
| "loss": 14.5781, | |
| "mean_token_accuracy": 0.5866025686264038, | |
| "num_tokens": 3741537.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.23076923076923078, | |
| "grad_norm": 9.026696458622231, | |
| "learning_rate": 1.7732793522267208e-05, | |
| "loss": 15.4062, | |
| "mean_token_accuracy": 0.5732284784317017, | |
| "num_tokens": 3817002.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.23481781376518218, | |
| "grad_norm": 8.576517147098428, | |
| "learning_rate": 1.7692307692307694e-05, | |
| "loss": 14.7969, | |
| "mean_token_accuracy": 0.577850878238678, | |
| "num_tokens": 3875145.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.2388663967611336, | |
| "grad_norm": 9.62049525740993, | |
| "learning_rate": 1.765182186234818e-05, | |
| "loss": 14.9531, | |
| "mean_token_accuracy": 0.5829423069953918, | |
| "num_tokens": 3947880.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.242914979757085, | |
| "grad_norm": 7.546747277273659, | |
| "learning_rate": 1.7611336032388667e-05, | |
| "loss": 13.5547, | |
| "mean_token_accuracy": 0.6001456379890442, | |
| "num_tokens": 4015226.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.242914979757085, | |
| "eval_loss": 1.859375, | |
| "eval_mean_token_accuracy": 0.5801091939210892, | |
| "eval_num_tokens": 4015226.0, | |
| "eval_runtime": 0.6356, | |
| "eval_samples_per_second": 314.646, | |
| "eval_steps_per_second": 6.293, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.24696356275303644, | |
| "grad_norm": 8.432094147883522, | |
| "learning_rate": 1.757085020242915e-05, | |
| "loss": 14.3438, | |
| "mean_token_accuracy": 0.5897209048271179, | |
| "num_tokens": 4076385.0, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.25101214574898784, | |
| "grad_norm": 8.727387350173785, | |
| "learning_rate": 1.7530364372469636e-05, | |
| "loss": 14.0859, | |
| "mean_token_accuracy": 0.5868391394615173, | |
| "num_tokens": 4130336.0, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.2550607287449393, | |
| "grad_norm": 7.853461601699973, | |
| "learning_rate": 1.7489878542510125e-05, | |
| "loss": 14.5625, | |
| "mean_token_accuracy": 0.5877260565757751, | |
| "num_tokens": 4205714.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.2591093117408907, | |
| "grad_norm": 10.618114952893986, | |
| "learning_rate": 1.7449392712550608e-05, | |
| "loss": 14.7891, | |
| "mean_token_accuracy": 0.5806258916854858, | |
| "num_tokens": 4276016.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "grad_norm": 8.368519887093084, | |
| "learning_rate": 1.7408906882591094e-05, | |
| "loss": 14.6094, | |
| "mean_token_accuracy": 0.5852103233337402, | |
| "num_tokens": 4341166.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "eval_loss": 1.8528125286102295, | |
| "eval_mean_token_accuracy": 0.5809510350227356, | |
| "eval_num_tokens": 4341166.0, | |
| "eval_runtime": 0.6479, | |
| "eval_samples_per_second": 308.698, | |
| "eval_steps_per_second": 6.174, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.26720647773279355, | |
| "grad_norm": 8.961964950224539, | |
| "learning_rate": 1.736842105263158e-05, | |
| "loss": 13.7656, | |
| "mean_token_accuracy": 0.5963848829269409, | |
| "num_tokens": 4394174.0, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.27125506072874495, | |
| "grad_norm": 8.15926394629814, | |
| "learning_rate": 1.7327935222672066e-05, | |
| "loss": 14.8906, | |
| "mean_token_accuracy": 0.5760456323623657, | |
| "num_tokens": 4463144.0, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.27530364372469635, | |
| "grad_norm": 7.093290877100628, | |
| "learning_rate": 1.7287449392712552e-05, | |
| "loss": 14.375, | |
| "mean_token_accuracy": 0.5898444056510925, | |
| "num_tokens": 4533140.0, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.2793522267206478, | |
| "grad_norm": 8.262028322497853, | |
| "learning_rate": 1.724696356275304e-05, | |
| "loss": 15.3672, | |
| "mean_token_accuracy": 0.5701308250427246, | |
| "num_tokens": 4596279.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.2834008097165992, | |
| "grad_norm": 7.614011800242887, | |
| "learning_rate": 1.720647773279352e-05, | |
| "loss": 13.5547, | |
| "mean_token_accuracy": 0.5994280576705933, | |
| "num_tokens": 4661033.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2834008097165992, | |
| "eval_loss": 1.8446874618530273, | |
| "eval_mean_token_accuracy": 0.5816306471824646, | |
| "eval_num_tokens": 4661033.0, | |
| "eval_runtime": 0.6219, | |
| "eval_samples_per_second": 321.57, | |
| "eval_steps_per_second": 6.431, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2874493927125506, | |
| "grad_norm": 8.501649392228297, | |
| "learning_rate": 1.716599190283401e-05, | |
| "loss": 14.5312, | |
| "mean_token_accuracy": 0.5782850384712219, | |
| "num_tokens": 4730708.0, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.291497975708502, | |
| "grad_norm": 7.022173928165628, | |
| "learning_rate": 1.7125506072874497e-05, | |
| "loss": 14.8203, | |
| "mean_token_accuracy": 0.5837699770927429, | |
| "num_tokens": 4804253.0, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.29554655870445345, | |
| "grad_norm": 7.489428149228862, | |
| "learning_rate": 1.708502024291498e-05, | |
| "loss": 13.9219, | |
| "mean_token_accuracy": 0.594904363155365, | |
| "num_tokens": 4873160.0, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.29959514170040485, | |
| "grad_norm": 8.856719704451924, | |
| "learning_rate": 1.7044534412955466e-05, | |
| "loss": 15.5781, | |
| "mean_token_accuracy": 0.5685590505599976, | |
| "num_tokens": 4942325.0, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.30364372469635625, | |
| "grad_norm": 7.197286431718064, | |
| "learning_rate": 1.7004048582995952e-05, | |
| "loss": 14.4922, | |
| "mean_token_accuracy": 0.5876879096031189, | |
| "num_tokens": 5014497.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.30364372469635625, | |
| "eval_loss": 1.8378125429153442, | |
| "eval_mean_token_accuracy": 0.5824010521173477, | |
| "eval_num_tokens": 5014497.0, | |
| "eval_runtime": 0.6338, | |
| "eval_samples_per_second": 315.564, | |
| "eval_steps_per_second": 6.311, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 9.04302504332728, | |
| "learning_rate": 1.6963562753036438e-05, | |
| "loss": 15.4844, | |
| "mean_token_accuracy": 0.571445107460022, | |
| "num_tokens": 5075118.0, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.3117408906882591, | |
| "grad_norm": 7.558608724226377, | |
| "learning_rate": 1.6923076923076924e-05, | |
| "loss": 14.5156, | |
| "mean_token_accuracy": 0.5831646919250488, | |
| "num_tokens": 5142386.0, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.3157894736842105, | |
| "grad_norm": 7.8941192536661955, | |
| "learning_rate": 1.688259109311741e-05, | |
| "loss": 13.6406, | |
| "mean_token_accuracy": 0.5994213223457336, | |
| "num_tokens": 5208462.0, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.31983805668016196, | |
| "grad_norm": 7.560860397902134, | |
| "learning_rate": 1.6842105263157896e-05, | |
| "loss": 15.3047, | |
| "mean_token_accuracy": 0.5730275511741638, | |
| "num_tokens": 5285004.0, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.32388663967611336, | |
| "grad_norm": 7.257123559107278, | |
| "learning_rate": 1.6801619433198383e-05, | |
| "loss": 14.5312, | |
| "mean_token_accuracy": 0.586321234703064, | |
| "num_tokens": 5349606.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.32388663967611336, | |
| "eval_loss": 1.8290624618530273, | |
| "eval_mean_token_accuracy": 0.583142414689064, | |
| "eval_num_tokens": 5349606.0, | |
| "eval_runtime": 0.6206, | |
| "eval_samples_per_second": 322.278, | |
| "eval_steps_per_second": 6.446, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.32793522267206476, | |
| "grad_norm": 8.385990372203198, | |
| "learning_rate": 1.676113360323887e-05, | |
| "loss": 15.8828, | |
| "mean_token_accuracy": 0.566261887550354, | |
| "num_tokens": 5413817.0, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.3319838056680162, | |
| "grad_norm": 8.40823722369304, | |
| "learning_rate": 1.672064777327935e-05, | |
| "loss": 15.5781, | |
| "mean_token_accuracy": 0.5658481121063232, | |
| "num_tokens": 5487505.0, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.3360323886639676, | |
| "grad_norm": 8.204736370997345, | |
| "learning_rate": 1.6680161943319838e-05, | |
| "loss": 13.2969, | |
| "mean_token_accuracy": 0.6081260442733765, | |
| "num_tokens": 5549321.0, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.340080971659919, | |
| "grad_norm": 7.713082914361621, | |
| "learning_rate": 1.6639676113360327e-05, | |
| "loss": 14.375, | |
| "mean_token_accuracy": 0.5855867266654968, | |
| "num_tokens": 5612660.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.3441295546558704, | |
| "grad_norm": 9.49322308557619, | |
| "learning_rate": 1.659919028340081e-05, | |
| "loss": 15.6953, | |
| "mean_token_accuracy": 0.5663869380950928, | |
| "num_tokens": 5685773.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.3441295546558704, | |
| "eval_loss": 1.8246874809265137, | |
| "eval_mean_token_accuracy": 0.5833836942911148, | |
| "eval_num_tokens": 5685773.0, | |
| "eval_runtime": 0.6299, | |
| "eval_samples_per_second": 317.535, | |
| "eval_steps_per_second": 6.351, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.3481781376518219, | |
| "grad_norm": 8.979306474955873, | |
| "learning_rate": 1.6558704453441296e-05, | |
| "loss": 14.8125, | |
| "mean_token_accuracy": 0.5798248648643494, | |
| "num_tokens": 5746238.0, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.3522267206477733, | |
| "grad_norm": 7.582176162306731, | |
| "learning_rate": 1.6518218623481782e-05, | |
| "loss": 15.5625, | |
| "mean_token_accuracy": 0.5640476942062378, | |
| "num_tokens": 5814993.0, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.3562753036437247, | |
| "grad_norm": 8.163709779919543, | |
| "learning_rate": 1.6477732793522268e-05, | |
| "loss": 15.125, | |
| "mean_token_accuracy": 0.5744800567626953, | |
| "num_tokens": 5884344.0, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.3603238866396761, | |
| "grad_norm": 7.597205625950287, | |
| "learning_rate": 1.6437246963562754e-05, | |
| "loss": 15.1172, | |
| "mean_token_accuracy": 0.5687373876571655, | |
| "num_tokens": 5954319.0, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.3643724696356275, | |
| "grad_norm": 7.2482847160987625, | |
| "learning_rate": 1.639676113360324e-05, | |
| "loss": 13.2422, | |
| "mean_token_accuracy": 0.6023260354995728, | |
| "num_tokens": 6019645.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3643724696356275, | |
| "eval_loss": 1.8174999952316284, | |
| "eval_mean_token_accuracy": 0.5845800191164017, | |
| "eval_num_tokens": 6019645.0, | |
| "eval_runtime": 0.6235, | |
| "eval_samples_per_second": 320.756, | |
| "eval_steps_per_second": 6.415, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3684210526315789, | |
| "grad_norm": 8.841871793018052, | |
| "learning_rate": 1.6356275303643723e-05, | |
| "loss": 13.7266, | |
| "mean_token_accuracy": 0.6017288565635681, | |
| "num_tokens": 6075352.0, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.3724696356275304, | |
| "grad_norm": 8.434804871321719, | |
| "learning_rate": 1.6315789473684213e-05, | |
| "loss": 14.0547, | |
| "mean_token_accuracy": 0.5935065746307373, | |
| "num_tokens": 6145764.0, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.3765182186234818, | |
| "grad_norm": 7.730347182665673, | |
| "learning_rate": 1.62753036437247e-05, | |
| "loss": 14.6641, | |
| "mean_token_accuracy": 0.5870702862739563, | |
| "num_tokens": 6210408.0, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3805668016194332, | |
| "grad_norm": 7.45273814710918, | |
| "learning_rate": 1.6234817813765185e-05, | |
| "loss": 14.2422, | |
| "mean_token_accuracy": 0.5905075669288635, | |
| "num_tokens": 6279074.0, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.38461538461538464, | |
| "grad_norm": 11.1983859182283, | |
| "learning_rate": 1.6194331983805668e-05, | |
| "loss": 15.3906, | |
| "mean_token_accuracy": 0.5667217373847961, | |
| "num_tokens": 6348733.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.38461538461538464, | |
| "eval_loss": 1.8106249570846558, | |
| "eval_mean_token_accuracy": 0.5847904682159424, | |
| "eval_num_tokens": 6348733.0, | |
| "eval_runtime": 0.6596, | |
| "eval_samples_per_second": 303.232, | |
| "eval_steps_per_second": 6.065, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.38866396761133604, | |
| "grad_norm": 6.824860263185756, | |
| "learning_rate": 1.6153846153846154e-05, | |
| "loss": 13.8672, | |
| "mean_token_accuracy": 0.5981316566467285, | |
| "num_tokens": 6418698.0, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.39271255060728744, | |
| "grad_norm": 7.431961140378814, | |
| "learning_rate": 1.6113360323886644e-05, | |
| "loss": 15.1719, | |
| "mean_token_accuracy": 0.5711833834648132, | |
| "num_tokens": 6488097.0, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.3967611336032389, | |
| "grad_norm": 8.677956082707139, | |
| "learning_rate": 1.6072874493927126e-05, | |
| "loss": 14.4922, | |
| "mean_token_accuracy": 0.585637092590332, | |
| "num_tokens": 6549764.0, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.4008097165991903, | |
| "grad_norm": 8.646021653592204, | |
| "learning_rate": 1.6032388663967612e-05, | |
| "loss": 14.75, | |
| "mean_token_accuracy": 0.5855600833892822, | |
| "num_tokens": 6609482.0, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.4048582995951417, | |
| "grad_norm": 7.907811817137269, | |
| "learning_rate": 1.59919028340081e-05, | |
| "loss": 15.0469, | |
| "mean_token_accuracy": 0.5750304460525513, | |
| "num_tokens": 6679311.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4048582995951417, | |
| "eval_loss": 1.807187557220459, | |
| "eval_mean_token_accuracy": 0.585128903388977, | |
| "eval_num_tokens": 6679311.0, | |
| "eval_runtime": 0.6262, | |
| "eval_samples_per_second": 319.397, | |
| "eval_steps_per_second": 6.388, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4089068825910931, | |
| "grad_norm": 7.835498999161377, | |
| "learning_rate": 1.5951417004048585e-05, | |
| "loss": 15.0156, | |
| "mean_token_accuracy": 0.5781442523002625, | |
| "num_tokens": 6736916.0, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.41295546558704455, | |
| "grad_norm": 7.972192619348582, | |
| "learning_rate": 1.591093117408907e-05, | |
| "loss": 14.3438, | |
| "mean_token_accuracy": 0.590062141418457, | |
| "num_tokens": 6802185.0, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.41700404858299595, | |
| "grad_norm": 7.983491081447614, | |
| "learning_rate": 1.5870445344129557e-05, | |
| "loss": 14.75, | |
| "mean_token_accuracy": 0.579732358455658, | |
| "num_tokens": 6863598.0, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 10.01408214824878, | |
| "learning_rate": 1.582995951417004e-05, | |
| "loss": 14.9219, | |
| "mean_token_accuracy": 0.5742595791816711, | |
| "num_tokens": 6927748.0, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.4251012145748988, | |
| "grad_norm": 7.779179257118591, | |
| "learning_rate": 1.578947368421053e-05, | |
| "loss": 13.8125, | |
| "mean_token_accuracy": 0.5934587121009827, | |
| "num_tokens": 6993579.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4251012145748988, | |
| "eval_loss": 1.8018749952316284, | |
| "eval_mean_token_accuracy": 0.5855304300785065, | |
| "eval_num_tokens": 6993579.0, | |
| "eval_runtime": 0.624, | |
| "eval_samples_per_second": 320.53, | |
| "eval_steps_per_second": 6.411, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4291497975708502, | |
| "grad_norm": 7.766216017833367, | |
| "learning_rate": 1.5748987854251015e-05, | |
| "loss": 14.9453, | |
| "mean_token_accuracy": 0.5719259977340698, | |
| "num_tokens": 7064007.0, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.4331983805668016, | |
| "grad_norm": 8.87590967566746, | |
| "learning_rate": 1.5708502024291498e-05, | |
| "loss": 14.1328, | |
| "mean_token_accuracy": 0.5912887454032898, | |
| "num_tokens": 7126932.0, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.43724696356275305, | |
| "grad_norm": 7.2824506866551015, | |
| "learning_rate": 1.5668016194331984e-05, | |
| "loss": 12.9688, | |
| "mean_token_accuracy": 0.6094680428504944, | |
| "num_tokens": 7188297.0, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.44129554655870445, | |
| "grad_norm": 8.281692416649847, | |
| "learning_rate": 1.562753036437247e-05, | |
| "loss": 14.7578, | |
| "mean_token_accuracy": 0.5806804895401001, | |
| "num_tokens": 7252286.0, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.44534412955465585, | |
| "grad_norm": 7.710589034957108, | |
| "learning_rate": 1.5587044534412957e-05, | |
| "loss": 13.9375, | |
| "mean_token_accuracy": 0.5891672968864441, | |
| "num_tokens": 7328970.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.44534412955465585, | |
| "eval_loss": 1.7962499856948853, | |
| "eval_mean_token_accuracy": 0.5861413329839706, | |
| "eval_num_tokens": 7328970.0, | |
| "eval_runtime": 0.6541, | |
| "eval_samples_per_second": 305.776, | |
| "eval_steps_per_second": 6.116, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.4493927125506073, | |
| "grad_norm": 9.932234036882006, | |
| "learning_rate": 1.5546558704453443e-05, | |
| "loss": 14.9844, | |
| "mean_token_accuracy": 0.583306074142456, | |
| "num_tokens": 7396220.0, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.4534412955465587, | |
| "grad_norm": 7.818592875809941, | |
| "learning_rate": 1.550607287449393e-05, | |
| "loss": 14.6562, | |
| "mean_token_accuracy": 0.5830007195472717, | |
| "num_tokens": 7454663.0, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.4574898785425101, | |
| "grad_norm": 8.077045865068051, | |
| "learning_rate": 1.5465587044534415e-05, | |
| "loss": 15.1641, | |
| "mean_token_accuracy": 0.5737788677215576, | |
| "num_tokens": 7515205.0, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.46153846153846156, | |
| "grad_norm": 8.172258229119766, | |
| "learning_rate": 1.54251012145749e-05, | |
| "loss": 13.7969, | |
| "mean_token_accuracy": 0.5884114503860474, | |
| "num_tokens": 7571704.0, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.46558704453441296, | |
| "grad_norm": 7.57672126802984, | |
| "learning_rate": 1.5384615384615387e-05, | |
| "loss": 13.6719, | |
| "mean_token_accuracy": 0.599338173866272, | |
| "num_tokens": 7634020.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.46558704453441296, | |
| "eval_loss": 1.7934374809265137, | |
| "eval_mean_token_accuracy": 0.5860239416360855, | |
| "eval_num_tokens": 7634020.0, | |
| "eval_runtime": 0.6265, | |
| "eval_samples_per_second": 319.23, | |
| "eval_steps_per_second": 6.385, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.46963562753036436, | |
| "grad_norm": 7.229443663353096, | |
| "learning_rate": 1.534412955465587e-05, | |
| "loss": 13.4922, | |
| "mean_token_accuracy": 0.5976777672767639, | |
| "num_tokens": 7702897.0, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.47368421052631576, | |
| "grad_norm": 8.594596623758976, | |
| "learning_rate": 1.5303643724696356e-05, | |
| "loss": 13.7969, | |
| "mean_token_accuracy": 0.595205545425415, | |
| "num_tokens": 7761445.0, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.4777327935222672, | |
| "grad_norm": 7.44049680844727, | |
| "learning_rate": 1.5263157894736846e-05, | |
| "loss": 14.4688, | |
| "mean_token_accuracy": 0.5831844210624695, | |
| "num_tokens": 7829220.0, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.4817813765182186, | |
| "grad_norm": 8.010264997343702, | |
| "learning_rate": 1.5222672064777328e-05, | |
| "loss": 14.3516, | |
| "mean_token_accuracy": 0.5925936698913574, | |
| "num_tokens": 7897361.0, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.48582995951417, | |
| "grad_norm": 9.11202877002977, | |
| "learning_rate": 1.5182186234817814e-05, | |
| "loss": 15.5078, | |
| "mean_token_accuracy": 0.5689979195594788, | |
| "num_tokens": 7958666.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.48582995951417, | |
| "eval_loss": 1.786562442779541, | |
| "eval_mean_token_accuracy": 0.587049126625061, | |
| "eval_num_tokens": 7958666.0, | |
| "eval_runtime": 0.6451, | |
| "eval_samples_per_second": 310.009, | |
| "eval_steps_per_second": 6.2, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.4898785425101215, | |
| "grad_norm": 8.733802509491014, | |
| "learning_rate": 1.5141700404858302e-05, | |
| "loss": 13.9922, | |
| "mean_token_accuracy": 0.591108500957489, | |
| "num_tokens": 8031901.0, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.4939271255060729, | |
| "grad_norm": 7.43499030469072, | |
| "learning_rate": 1.5101214574898787e-05, | |
| "loss": 13.6719, | |
| "mean_token_accuracy": 0.6017109155654907, | |
| "num_tokens": 8105143.0, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.4979757085020243, | |
| "grad_norm": 7.117038860943061, | |
| "learning_rate": 1.5060728744939273e-05, | |
| "loss": 14.5234, | |
| "mean_token_accuracy": 0.580233633518219, | |
| "num_tokens": 8177876.0, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.5020242914979757, | |
| "grad_norm": 7.746503576535467, | |
| "learning_rate": 1.5020242914979759e-05, | |
| "loss": 14.6797, | |
| "mean_token_accuracy": 0.5798138380050659, | |
| "num_tokens": 8246274.0, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5060728744939271, | |
| "grad_norm": 7.532018226813427, | |
| "learning_rate": 1.4979757085020243e-05, | |
| "loss": 14.1562, | |
| "mean_token_accuracy": 0.582686722278595, | |
| "num_tokens": 8315418.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5060728744939271, | |
| "eval_loss": 1.7821874618530273, | |
| "eval_mean_token_accuracy": 0.5875359177589417, | |
| "eval_num_tokens": 8315418.0, | |
| "eval_runtime": 0.6323, | |
| "eval_samples_per_second": 316.295, | |
| "eval_steps_per_second": 6.326, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5101214574898786, | |
| "grad_norm": 8.864364235981503, | |
| "learning_rate": 1.493927125506073e-05, | |
| "loss": 14.4688, | |
| "mean_token_accuracy": 0.5993614792823792, | |
| "num_tokens": 8372174.0, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.5141700404858299, | |
| "grad_norm": 8.95321820232742, | |
| "learning_rate": 1.4898785425101216e-05, | |
| "loss": 14.3438, | |
| "mean_token_accuracy": 0.5901056528091431, | |
| "num_tokens": 8441146.0, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5182186234817814, | |
| "grad_norm": 7.263835476936067, | |
| "learning_rate": 1.48582995951417e-05, | |
| "loss": 13.9688, | |
| "mean_token_accuracy": 0.594972550868988, | |
| "num_tokens": 8511862.0, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.5222672064777328, | |
| "grad_norm": 9.946116238413886, | |
| "learning_rate": 1.4817813765182188e-05, | |
| "loss": 14.6953, | |
| "mean_token_accuracy": 0.5761271119117737, | |
| "num_tokens": 8579819.0, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 7.617051573886364, | |
| "learning_rate": 1.4777327935222674e-05, | |
| "loss": 12.9141, | |
| "mean_token_accuracy": 0.6136053800582886, | |
| "num_tokens": 8648885.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "eval_loss": 1.7781250476837158, | |
| "eval_mean_token_accuracy": 0.5874989777803421, | |
| "eval_num_tokens": 8648885.0, | |
| "eval_runtime": 0.6456, | |
| "eval_samples_per_second": 309.786, | |
| "eval_steps_per_second": 6.196, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5303643724696356, | |
| "grad_norm": 7.995063941880695, | |
| "learning_rate": 1.4736842105263159e-05, | |
| "loss": 14.2891, | |
| "mean_token_accuracy": 0.5915219187736511, | |
| "num_tokens": 8716889.0, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.5344129554655871, | |
| "grad_norm": 7.9292860672813665, | |
| "learning_rate": 1.4696356275303645e-05, | |
| "loss": 13.5078, | |
| "mean_token_accuracy": 0.6026769280433655, | |
| "num_tokens": 8777395.0, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.5384615384615384, | |
| "grad_norm": 7.586666671628873, | |
| "learning_rate": 1.465587044534413e-05, | |
| "loss": 14.8672, | |
| "mean_token_accuracy": 0.5739372372627258, | |
| "num_tokens": 8848242.0, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5425101214574899, | |
| "grad_norm": 8.426747584830805, | |
| "learning_rate": 1.4615384615384615e-05, | |
| "loss": 14.6484, | |
| "mean_token_accuracy": 0.5826023817062378, | |
| "num_tokens": 8910211.0, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.5465587044534413, | |
| "grad_norm": 8.34819038901423, | |
| "learning_rate": 1.4574898785425103e-05, | |
| "loss": 15.3672, | |
| "mean_token_accuracy": 0.5670615434646606, | |
| "num_tokens": 8976714.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.5465587044534413, | |
| "eval_loss": 1.770937442779541, | |
| "eval_mean_token_accuracy": 0.5890246480703354, | |
| "eval_num_tokens": 8976714.0, | |
| "eval_runtime": 0.627, | |
| "eval_samples_per_second": 318.954, | |
| "eval_steps_per_second": 6.379, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.5506072874493927, | |
| "grad_norm": 7.635072916228879, | |
| "learning_rate": 1.453441295546559e-05, | |
| "loss": 13.5703, | |
| "mean_token_accuracy": 0.5985166430473328, | |
| "num_tokens": 9032733.0, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.5546558704453441, | |
| "grad_norm": 7.13210528504054, | |
| "learning_rate": 1.4493927125506074e-05, | |
| "loss": 14.0469, | |
| "mean_token_accuracy": 0.5941102504730225, | |
| "num_tokens": 9106960.0, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.5587044534412956, | |
| "grad_norm": 7.472585203507861, | |
| "learning_rate": 1.445344129554656e-05, | |
| "loss": 13.4141, | |
| "mean_token_accuracy": 0.6021690964698792, | |
| "num_tokens": 9171843.0, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.562753036437247, | |
| "grad_norm": 8.097767778322229, | |
| "learning_rate": 1.4412955465587046e-05, | |
| "loss": 14.8047, | |
| "mean_token_accuracy": 0.578671932220459, | |
| "num_tokens": 9238411.0, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.5668016194331984, | |
| "grad_norm": 8.339563519464773, | |
| "learning_rate": 1.437246963562753e-05, | |
| "loss": 14.6484, | |
| "mean_token_accuracy": 0.5821104645729065, | |
| "num_tokens": 9300623.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5668016194331984, | |
| "eval_loss": 1.7675000429153442, | |
| "eval_mean_token_accuracy": 0.5898215025663376, | |
| "eval_num_tokens": 9300623.0, | |
| "eval_runtime": 0.6311, | |
| "eval_samples_per_second": 316.883, | |
| "eval_steps_per_second": 6.338, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5708502024291497, | |
| "grad_norm": 7.211581007077111, | |
| "learning_rate": 1.4331983805668017e-05, | |
| "loss": 14.1094, | |
| "mean_token_accuracy": 0.5875740051269531, | |
| "num_tokens": 9364199.0, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.5748987854251012, | |
| "grad_norm": 8.547180559454508, | |
| "learning_rate": 1.4291497975708504e-05, | |
| "loss": 13.3594, | |
| "mean_token_accuracy": 0.6009641885757446, | |
| "num_tokens": 9427112.0, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.5789473684210527, | |
| "grad_norm": 8.314592250089833, | |
| "learning_rate": 1.4251012145748989e-05, | |
| "loss": 13.6719, | |
| "mean_token_accuracy": 0.5959135890007019, | |
| "num_tokens": 9493444.0, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.582995951417004, | |
| "grad_norm": 8.353437279154521, | |
| "learning_rate": 1.4210526315789475e-05, | |
| "loss": 14.4453, | |
| "mean_token_accuracy": 0.5817537307739258, | |
| "num_tokens": 9563780.0, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.5870445344129555, | |
| "grad_norm": 7.105730625315736, | |
| "learning_rate": 1.4170040485829961e-05, | |
| "loss": 13.8828, | |
| "mean_token_accuracy": 0.5948812961578369, | |
| "num_tokens": 9634643.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.5870445344129555, | |
| "eval_loss": 1.764062523841858, | |
| "eval_mean_token_accuracy": 0.589348778128624, | |
| "eval_num_tokens": 9634643.0, | |
| "eval_runtime": 0.6326, | |
| "eval_samples_per_second": 316.156, | |
| "eval_steps_per_second": 6.323, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.5910931174089069, | |
| "grad_norm": 8.85529190632474, | |
| "learning_rate": 1.4129554655870446e-05, | |
| "loss": 14.9609, | |
| "mean_token_accuracy": 0.5746488571166992, | |
| "num_tokens": 9705552.0, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.5951417004048583, | |
| "grad_norm": 7.404634749249628, | |
| "learning_rate": 1.4089068825910932e-05, | |
| "loss": 14.1562, | |
| "mean_token_accuracy": 0.5874021053314209, | |
| "num_tokens": 9784121.0, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.5991902834008097, | |
| "grad_norm": 8.414339407163222, | |
| "learning_rate": 1.4048582995951418e-05, | |
| "loss": 14.2266, | |
| "mean_token_accuracy": 0.5838397741317749, | |
| "num_tokens": 9855434.0, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6032388663967612, | |
| "grad_norm": 8.385715636253362, | |
| "learning_rate": 1.4008097165991902e-05, | |
| "loss": 14.9531, | |
| "mean_token_accuracy": 0.580719530582428, | |
| "num_tokens": 9932270.0, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.6072874493927125, | |
| "grad_norm": 8.437220430570475, | |
| "learning_rate": 1.396761133603239e-05, | |
| "loss": 13.5312, | |
| "mean_token_accuracy": 0.6000396013259888, | |
| "num_tokens": 9987882.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6072874493927125, | |
| "eval_loss": 1.7596875429153442, | |
| "eval_mean_token_accuracy": 0.592434361577034, | |
| "eval_num_tokens": 9987882.0, | |
| "eval_runtime": 0.6411, | |
| "eval_samples_per_second": 311.96, | |
| "eval_steps_per_second": 6.239, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.611336032388664, | |
| "grad_norm": 7.3392719089419, | |
| "learning_rate": 1.3927125506072876e-05, | |
| "loss": 14.1953, | |
| "mean_token_accuracy": 0.5917255878448486, | |
| "num_tokens": 10053860.0, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 7.106128585719554, | |
| "learning_rate": 1.388663967611336e-05, | |
| "loss": 13.9062, | |
| "mean_token_accuracy": 0.5945723652839661, | |
| "num_tokens": 10118446.0, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.6194331983805668, | |
| "grad_norm": 8.771091973478418, | |
| "learning_rate": 1.3846153846153847e-05, | |
| "loss": 13.6953, | |
| "mean_token_accuracy": 0.5965032577514648, | |
| "num_tokens": 10185658.0, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6234817813765182, | |
| "grad_norm": 7.463408517295952, | |
| "learning_rate": 1.3805668016194333e-05, | |
| "loss": 14.5234, | |
| "mean_token_accuracy": 0.5856794118881226, | |
| "num_tokens": 10256194.0, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.6275303643724697, | |
| "grad_norm": 7.766925275194455, | |
| "learning_rate": 1.3765182186234817e-05, | |
| "loss": 14.4844, | |
| "mean_token_accuracy": 0.5821614861488342, | |
| "num_tokens": 10322676.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6275303643724697, | |
| "eval_loss": 1.7568750381469727, | |
| "eval_mean_token_accuracy": 0.593303382396698, | |
| "eval_num_tokens": 10322676.0, | |
| "eval_runtime": 0.6285, | |
| "eval_samples_per_second": 318.24, | |
| "eval_steps_per_second": 6.365, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 11.730558732427344, | |
| "learning_rate": 1.3724696356275305e-05, | |
| "loss": 14.1797, | |
| "mean_token_accuracy": 0.590240478515625, | |
| "num_tokens": 10390182.0, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.6356275303643725, | |
| "grad_norm": 6.962088111992022, | |
| "learning_rate": 1.3684210526315791e-05, | |
| "loss": 14.6328, | |
| "mean_token_accuracy": 0.5875160694122314, | |
| "num_tokens": 10482669.0, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.6396761133603239, | |
| "grad_norm": 7.342884429279778, | |
| "learning_rate": 1.3643724696356277e-05, | |
| "loss": 13.4531, | |
| "mean_token_accuracy": 0.6036462783813477, | |
| "num_tokens": 10543343.0, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.6437246963562753, | |
| "grad_norm": 9.243698402724284, | |
| "learning_rate": 1.3603238866396762e-05, | |
| "loss": 14.3438, | |
| "mean_token_accuracy": 0.5891345739364624, | |
| "num_tokens": 10605678.0, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.6477732793522267, | |
| "grad_norm": 6.586652944449273, | |
| "learning_rate": 1.3562753036437248e-05, | |
| "loss": 13.6094, | |
| "mean_token_accuracy": 0.6069329977035522, | |
| "num_tokens": 10678928.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6477732793522267, | |
| "eval_loss": 1.751562476158142, | |
| "eval_mean_token_accuracy": 0.5941518694162369, | |
| "eval_num_tokens": 10678928.0, | |
| "eval_runtime": 0.6304, | |
| "eval_samples_per_second": 317.246, | |
| "eval_steps_per_second": 6.345, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6518218623481782, | |
| "grad_norm": 7.292010909500028, | |
| "learning_rate": 1.3522267206477734e-05, | |
| "loss": 15.5391, | |
| "mean_token_accuracy": 0.5630365014076233, | |
| "num_tokens": 10752497.0, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.6558704453441295, | |
| "grad_norm": 8.517901596352386, | |
| "learning_rate": 1.3481781376518219e-05, | |
| "loss": 13.8203, | |
| "mean_token_accuracy": 0.606192409992218, | |
| "num_tokens": 10808888.0, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.659919028340081, | |
| "grad_norm": 7.047539986657361, | |
| "learning_rate": 1.3441295546558706e-05, | |
| "loss": 12.6094, | |
| "mean_token_accuracy": 0.6234721541404724, | |
| "num_tokens": 10867941.0, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.6639676113360324, | |
| "grad_norm": 8.193594708311382, | |
| "learning_rate": 1.3400809716599193e-05, | |
| "loss": 15.1562, | |
| "mean_token_accuracy": 0.5752149224281311, | |
| "num_tokens": 10945011.0, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.6680161943319838, | |
| "grad_norm": 7.0437604094189386, | |
| "learning_rate": 1.3360323886639677e-05, | |
| "loss": 13.1094, | |
| "mean_token_accuracy": 0.6154734492301941, | |
| "num_tokens": 11020417.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.6680161943319838, | |
| "eval_loss": 1.7496875524520874, | |
| "eval_mean_token_accuracy": 0.5943519324064255, | |
| "eval_num_tokens": 11020417.0, | |
| "eval_runtime": 0.6499, | |
| "eval_samples_per_second": 307.758, | |
| "eval_steps_per_second": 6.155, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.6720647773279352, | |
| "grad_norm": 7.138901056236762, | |
| "learning_rate": 1.3319838056680163e-05, | |
| "loss": 13.8594, | |
| "mean_token_accuracy": 0.5956168174743652, | |
| "num_tokens": 11089883.0, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.6761133603238867, | |
| "grad_norm": 8.201709305900888, | |
| "learning_rate": 1.327935222672065e-05, | |
| "loss": 13.7344, | |
| "mean_token_accuracy": 0.5976811051368713, | |
| "num_tokens": 11147819.0, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.680161943319838, | |
| "grad_norm": 7.8526020934287954, | |
| "learning_rate": 1.3238866396761134e-05, | |
| "loss": 13.9766, | |
| "mean_token_accuracy": 0.5927841067314148, | |
| "num_tokens": 11219059.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.6842105263157895, | |
| "grad_norm": 7.318409427810135, | |
| "learning_rate": 1.3198380566801622e-05, | |
| "loss": 14.3281, | |
| "mean_token_accuracy": 0.5857775211334229, | |
| "num_tokens": 11278072.0, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.6882591093117408, | |
| "grad_norm": 7.389155589413203, | |
| "learning_rate": 1.3157894736842108e-05, | |
| "loss": 14.4297, | |
| "mean_token_accuracy": 0.587368905544281, | |
| "num_tokens": 11351732.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.6882591093117408, | |
| "eval_loss": 1.746250033378601, | |
| "eval_mean_token_accuracy": 0.5935623943805695, | |
| "eval_num_tokens": 11351732.0, | |
| "eval_runtime": 0.6314, | |
| "eval_samples_per_second": 316.758, | |
| "eval_steps_per_second": 6.335, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.6923076923076923, | |
| "grad_norm": 8.124816197332432, | |
| "learning_rate": 1.3117408906882592e-05, | |
| "loss": 14.1172, | |
| "mean_token_accuracy": 0.5922242999076843, | |
| "num_tokens": 11407714.0, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.6963562753036437, | |
| "grad_norm": 7.965720782927913, | |
| "learning_rate": 1.3076923076923078e-05, | |
| "loss": 13.4609, | |
| "mean_token_accuracy": 0.6015889048576355, | |
| "num_tokens": 11484184.0, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.7004048582995951, | |
| "grad_norm": 8.915221369950393, | |
| "learning_rate": 1.3036437246963564e-05, | |
| "loss": 14.1875, | |
| "mean_token_accuracy": 0.5905665755271912, | |
| "num_tokens": 11544099.0, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.7044534412955465, | |
| "grad_norm": 8.790247334783299, | |
| "learning_rate": 1.2995951417004049e-05, | |
| "loss": 13.9219, | |
| "mean_token_accuracy": 0.5985158681869507, | |
| "num_tokens": 11606691.0, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.708502024291498, | |
| "grad_norm": 9.868245626915737, | |
| "learning_rate": 1.2955465587044535e-05, | |
| "loss": 13.6172, | |
| "mean_token_accuracy": 0.5943373441696167, | |
| "num_tokens": 11672625.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.708502024291498, | |
| "eval_loss": 1.7431249618530273, | |
| "eval_mean_token_accuracy": 0.5946680456399918, | |
| "eval_num_tokens": 11672625.0, | |
| "eval_runtime": 0.6432, | |
| "eval_samples_per_second": 310.949, | |
| "eval_steps_per_second": 6.219, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7125506072874493, | |
| "grad_norm": 8.884030860368474, | |
| "learning_rate": 1.2914979757085023e-05, | |
| "loss": 14.0, | |
| "mean_token_accuracy": 0.5949345827102661, | |
| "num_tokens": 11732625.0, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.7165991902834008, | |
| "grad_norm": 9.007310004782914, | |
| "learning_rate": 1.2874493927125507e-05, | |
| "loss": 14.0156, | |
| "mean_token_accuracy": 0.6007354259490967, | |
| "num_tokens": 11801766.0, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.7206477732793523, | |
| "grad_norm": 8.33208496329059, | |
| "learning_rate": 1.2834008097165993e-05, | |
| "loss": 14.8281, | |
| "mean_token_accuracy": 0.5790393352508545, | |
| "num_tokens": 11863641.0, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.7246963562753036, | |
| "grad_norm": 7.426550244631567, | |
| "learning_rate": 1.279352226720648e-05, | |
| "loss": 14.5625, | |
| "mean_token_accuracy": 0.5841023325920105, | |
| "num_tokens": 11933275.0, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.728744939271255, | |
| "grad_norm": 6.8034373170968525, | |
| "learning_rate": 1.2753036437246964e-05, | |
| "loss": 14.6562, | |
| "mean_token_accuracy": 0.5787296295166016, | |
| "num_tokens": 12017691.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.728744939271255, | |
| "eval_loss": 1.7400000095367432, | |
| "eval_mean_token_accuracy": 0.5955071151256561, | |
| "eval_num_tokens": 12017691.0, | |
| "eval_runtime": 0.6493, | |
| "eval_samples_per_second": 308.043, | |
| "eval_steps_per_second": 6.161, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7327935222672065, | |
| "grad_norm": 7.057663809648343, | |
| "learning_rate": 1.271255060728745e-05, | |
| "loss": 13.6875, | |
| "mean_token_accuracy": 0.5996446013450623, | |
| "num_tokens": 12079093.0, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.7368421052631579, | |
| "grad_norm": 7.371289124958891, | |
| "learning_rate": 1.2672064777327936e-05, | |
| "loss": 14.5234, | |
| "mean_token_accuracy": 0.5833817720413208, | |
| "num_tokens": 12160008.0, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.7408906882591093, | |
| "grad_norm": 8.707687219264766, | |
| "learning_rate": 1.263157894736842e-05, | |
| "loss": 14.2266, | |
| "mean_token_accuracy": 0.5921141505241394, | |
| "num_tokens": 12229741.0, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.7449392712550608, | |
| "grad_norm": 7.516841864927085, | |
| "learning_rate": 1.2591093117408908e-05, | |
| "loss": 14.7891, | |
| "mean_token_accuracy": 0.5818018913269043, | |
| "num_tokens": 12301362.0, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.7489878542510121, | |
| "grad_norm": 7.6633569797452346, | |
| "learning_rate": 1.2550607287449395e-05, | |
| "loss": 13.6406, | |
| "mean_token_accuracy": 0.6013834476470947, | |
| "num_tokens": 12370239.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.7489878542510121, | |
| "eval_loss": 1.7356250286102295, | |
| "eval_mean_token_accuracy": 0.5960101634263992, | |
| "eval_num_tokens": 12370239.0, | |
| "eval_runtime": 0.6504, | |
| "eval_samples_per_second": 307.492, | |
| "eval_steps_per_second": 6.15, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.7530364372469636, | |
| "grad_norm": 6.968835438956393, | |
| "learning_rate": 1.2510121457489879e-05, | |
| "loss": 13.5625, | |
| "mean_token_accuracy": 0.6022695302963257, | |
| "num_tokens": 12441507.0, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.757085020242915, | |
| "grad_norm": 7.195791337914372, | |
| "learning_rate": 1.2469635627530365e-05, | |
| "loss": 13.4609, | |
| "mean_token_accuracy": 0.6029326915740967, | |
| "num_tokens": 12502266.0, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.7611336032388664, | |
| "grad_norm": 7.668677247147722, | |
| "learning_rate": 1.2429149797570851e-05, | |
| "loss": 13.8047, | |
| "mean_token_accuracy": 0.5992096662521362, | |
| "num_tokens": 12566099.0, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.7651821862348178, | |
| "grad_norm": 7.04566661191058, | |
| "learning_rate": 1.2388663967611336e-05, | |
| "loss": 13.6875, | |
| "mean_token_accuracy": 0.6022654175758362, | |
| "num_tokens": 12634407.0, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.7692307692307693, | |
| "grad_norm": 7.697888634722549, | |
| "learning_rate": 1.2348178137651824e-05, | |
| "loss": 13.5703, | |
| "mean_token_accuracy": 0.6025985479354858, | |
| "num_tokens": 12695120.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.7692307692307693, | |
| "eval_loss": 1.734375, | |
| "eval_mean_token_accuracy": 0.5960522890090942, | |
| "eval_num_tokens": 12695120.0, | |
| "eval_runtime": 0.6494, | |
| "eval_samples_per_second": 307.96, | |
| "eval_steps_per_second": 6.159, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.7732793522267206, | |
| "grad_norm": 6.785752803586286, | |
| "learning_rate": 1.230769230769231e-05, | |
| "loss": 13.2031, | |
| "mean_token_accuracy": 0.6083678603172302, | |
| "num_tokens": 12765477.0, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.7773279352226721, | |
| "grad_norm": 8.429385939587196, | |
| "learning_rate": 1.2267206477732794e-05, | |
| "loss": 13.6719, | |
| "mean_token_accuracy": 0.5998564958572388, | |
| "num_tokens": 12835216.0, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.7813765182186235, | |
| "grad_norm": 7.85611095251478, | |
| "learning_rate": 1.222672064777328e-05, | |
| "loss": 13.1328, | |
| "mean_token_accuracy": 0.6025688648223877, | |
| "num_tokens": 12902937.0, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.7854251012145749, | |
| "grad_norm": 7.6755487929843556, | |
| "learning_rate": 1.2186234817813766e-05, | |
| "loss": 13.0703, | |
| "mean_token_accuracy": 0.6151934266090393, | |
| "num_tokens": 12958920.0, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "grad_norm": 9.563095955991269, | |
| "learning_rate": 1.2145748987854251e-05, | |
| "loss": 13.75, | |
| "mean_token_accuracy": 0.5952048301696777, | |
| "num_tokens": 13019128.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "eval_loss": 1.730625033378601, | |
| "eval_mean_token_accuracy": 0.5964938104152679, | |
| "eval_num_tokens": 13019128.0, | |
| "eval_runtime": 0.6302, | |
| "eval_samples_per_second": 317.371, | |
| "eval_steps_per_second": 6.347, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.7935222672064778, | |
| "grad_norm": 7.513918136227913, | |
| "learning_rate": 1.2105263157894737e-05, | |
| "loss": 14.5703, | |
| "mean_token_accuracy": 0.5844899415969849, | |
| "num_tokens": 13085188.0, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.7975708502024291, | |
| "grad_norm": 7.287761719869118, | |
| "learning_rate": 1.2064777327935225e-05, | |
| "loss": 13.6484, | |
| "mean_token_accuracy": 0.5984832048416138, | |
| "num_tokens": 13155269.0, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.8016194331983806, | |
| "grad_norm": 7.383829587566862, | |
| "learning_rate": 1.202429149797571e-05, | |
| "loss": 14.5156, | |
| "mean_token_accuracy": 0.5839080214500427, | |
| "num_tokens": 13229283.0, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.805668016194332, | |
| "grad_norm": 12.228642188104056, | |
| "learning_rate": 1.1983805668016195e-05, | |
| "loss": 14.0234, | |
| "mean_token_accuracy": 0.5865586400032043, | |
| "num_tokens": 13298358.0, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.8097165991902834, | |
| "grad_norm": 8.01540634329265, | |
| "learning_rate": 1.1943319838056682e-05, | |
| "loss": 13.4844, | |
| "mean_token_accuracy": 0.603053867816925, | |
| "num_tokens": 13366925.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8097165991902834, | |
| "eval_loss": 1.7278125286102295, | |
| "eval_mean_token_accuracy": 0.5974646657705307, | |
| "eval_num_tokens": 13366925.0, | |
| "eval_runtime": 0.6458, | |
| "eval_samples_per_second": 309.671, | |
| "eval_steps_per_second": 6.193, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8137651821862348, | |
| "grad_norm": 8.670916760818496, | |
| "learning_rate": 1.1902834008097166e-05, | |
| "loss": 13.5781, | |
| "mean_token_accuracy": 0.5980917811393738, | |
| "num_tokens": 13423691.0, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.8178137651821862, | |
| "grad_norm": 8.271091772585052, | |
| "learning_rate": 1.1862348178137652e-05, | |
| "loss": 13.5938, | |
| "mean_token_accuracy": 0.6054921746253967, | |
| "num_tokens": 13484423.0, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.8218623481781376, | |
| "grad_norm": 7.300340294135019, | |
| "learning_rate": 1.182186234817814e-05, | |
| "loss": 13.1016, | |
| "mean_token_accuracy": 0.60859215259552, | |
| "num_tokens": 13559695.0, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.8259109311740891, | |
| "grad_norm": 9.1407332883229, | |
| "learning_rate": 1.1781376518218623e-05, | |
| "loss": 14.1406, | |
| "mean_token_accuracy": 0.5899603366851807, | |
| "num_tokens": 13645280.0, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.8299595141700404, | |
| "grad_norm": 9.22684063564975, | |
| "learning_rate": 1.174089068825911e-05, | |
| "loss": 13.6562, | |
| "mean_token_accuracy": 0.5998020768165588, | |
| "num_tokens": 13712046.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8299595141700404, | |
| "eval_loss": 1.7253124713897705, | |
| "eval_mean_token_accuracy": 0.5980972796678543, | |
| "eval_num_tokens": 13712046.0, | |
| "eval_runtime": 0.6282, | |
| "eval_samples_per_second": 318.347, | |
| "eval_steps_per_second": 6.367, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8340080971659919, | |
| "grad_norm": 7.12821183237729, | |
| "learning_rate": 1.1700404858299597e-05, | |
| "loss": 14.1094, | |
| "mean_token_accuracy": 0.591342568397522, | |
| "num_tokens": 13776702.0, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.8380566801619433, | |
| "grad_norm": 8.729950014794346, | |
| "learning_rate": 1.1659919028340081e-05, | |
| "loss": 14.2266, | |
| "mean_token_accuracy": 0.5930887460708618, | |
| "num_tokens": 13836408.0, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 7.301044795096497, | |
| "learning_rate": 1.1619433198380567e-05, | |
| "loss": 14.4922, | |
| "mean_token_accuracy": 0.5839249491691589, | |
| "num_tokens": 13900559.0, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.8461538461538461, | |
| "grad_norm": 7.369160590617854, | |
| "learning_rate": 1.1578947368421053e-05, | |
| "loss": 13.5703, | |
| "mean_token_accuracy": 0.6064897775650024, | |
| "num_tokens": 13970733.0, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.8502024291497976, | |
| "grad_norm": 7.851482544759256, | |
| "learning_rate": 1.1538461538461538e-05, | |
| "loss": 13.1406, | |
| "mean_token_accuracy": 0.6058458089828491, | |
| "num_tokens": 14039496.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.8502024291497976, | |
| "eval_loss": 1.7237499952316284, | |
| "eval_mean_token_accuracy": 0.5978213995695114, | |
| "eval_num_tokens": 14039496.0, | |
| "eval_runtime": 0.6629, | |
| "eval_samples_per_second": 301.705, | |
| "eval_steps_per_second": 6.034, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.854251012145749, | |
| "grad_norm": 7.716996728334529, | |
| "learning_rate": 1.1497975708502026e-05, | |
| "loss": 13.8281, | |
| "mean_token_accuracy": 0.5954342484474182, | |
| "num_tokens": 14102200.0, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.8582995951417004, | |
| "grad_norm": 8.015987460003101, | |
| "learning_rate": 1.1457489878542512e-05, | |
| "loss": 14.3672, | |
| "mean_token_accuracy": 0.5886507034301758, | |
| "num_tokens": 14170374.0, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.8623481781376519, | |
| "grad_norm": 7.8139654343716565, | |
| "learning_rate": 1.1417004048582996e-05, | |
| "loss": 14.0312, | |
| "mean_token_accuracy": 0.5954753756523132, | |
| "num_tokens": 14231040.0, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.8663967611336032, | |
| "grad_norm": 7.595765550190981, | |
| "learning_rate": 1.1376518218623482e-05, | |
| "loss": 13.9375, | |
| "mean_token_accuracy": 0.5962504744529724, | |
| "num_tokens": 14296979.0, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.8704453441295547, | |
| "grad_norm": 8.046423435434846, | |
| "learning_rate": 1.1336032388663969e-05, | |
| "loss": 14.6328, | |
| "mean_token_accuracy": 0.5841693878173828, | |
| "num_tokens": 14372997.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.8704453441295547, | |
| "eval_loss": 1.720312476158142, | |
| "eval_mean_token_accuracy": 0.5980706065893173, | |
| "eval_num_tokens": 14372997.0, | |
| "eval_runtime": 0.6391, | |
| "eval_samples_per_second": 312.957, | |
| "eval_steps_per_second": 6.259, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.8744939271255061, | |
| "grad_norm": 8.7705422913405, | |
| "learning_rate": 1.1295546558704453e-05, | |
| "loss": 14.0859, | |
| "mean_token_accuracy": 0.5949749946594238, | |
| "num_tokens": 14438573.0, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.8785425101214575, | |
| "grad_norm": 7.832794988425434, | |
| "learning_rate": 1.1255060728744939e-05, | |
| "loss": 13.1484, | |
| "mean_token_accuracy": 0.613569438457489, | |
| "num_tokens": 14501484.0, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.8825910931174089, | |
| "grad_norm": 7.669341614248951, | |
| "learning_rate": 1.1214574898785427e-05, | |
| "loss": 14.7031, | |
| "mean_token_accuracy": 0.5858564972877502, | |
| "num_tokens": 14574868.0, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.8866396761133604, | |
| "grad_norm": 7.973334988452109, | |
| "learning_rate": 1.1174089068825913e-05, | |
| "loss": 13.8047, | |
| "mean_token_accuracy": 0.5929080247879028, | |
| "num_tokens": 14635874.0, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.8906882591093117, | |
| "grad_norm": 9.162769594031285, | |
| "learning_rate": 1.1133603238866398e-05, | |
| "loss": 14.6016, | |
| "mean_token_accuracy": 0.5782532095909119, | |
| "num_tokens": 14698568.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.8906882591093117, | |
| "eval_loss": 1.7178125381469727, | |
| "eval_mean_token_accuracy": 0.5981580018997192, | |
| "eval_num_tokens": 14698568.0, | |
| "eval_runtime": 0.6351, | |
| "eval_samples_per_second": 314.936, | |
| "eval_steps_per_second": 6.299, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.8947368421052632, | |
| "grad_norm": 7.206939771632334, | |
| "learning_rate": 1.1093117408906884e-05, | |
| "loss": 13.6406, | |
| "mean_token_accuracy": 0.5951396226882935, | |
| "num_tokens": 14762701.0, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.8987854251012146, | |
| "grad_norm": 8.38441745268265, | |
| "learning_rate": 1.105263157894737e-05, | |
| "loss": 14.3203, | |
| "mean_token_accuracy": 0.5884565114974976, | |
| "num_tokens": 14836157.0, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.902834008097166, | |
| "grad_norm": 7.099567588103198, | |
| "learning_rate": 1.1012145748987854e-05, | |
| "loss": 13.6875, | |
| "mean_token_accuracy": 0.6000562310218811, | |
| "num_tokens": 14903783.0, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.9068825910931174, | |
| "grad_norm": 7.639650741767317, | |
| "learning_rate": 1.0971659919028342e-05, | |
| "loss": 13.1719, | |
| "mean_token_accuracy": 0.6127068996429443, | |
| "num_tokens": 14964145.0, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.9109311740890689, | |
| "grad_norm": 7.438961030040257, | |
| "learning_rate": 1.0931174089068828e-05, | |
| "loss": 14.3281, | |
| "mean_token_accuracy": 0.5837487578392029, | |
| "num_tokens": 15042849.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.9109311740890689, | |
| "eval_loss": 1.7165625095367432, | |
| "eval_mean_token_accuracy": 0.5983149856328964, | |
| "eval_num_tokens": 15042849.0, | |
| "eval_runtime": 0.6312, | |
| "eval_samples_per_second": 316.878, | |
| "eval_steps_per_second": 6.338, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.9149797570850202, | |
| "grad_norm": 8.491938043424565, | |
| "learning_rate": 1.0890688259109313e-05, | |
| "loss": 13.75, | |
| "mean_token_accuracy": 0.5973576903343201, | |
| "num_tokens": 15107705.0, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.9190283400809717, | |
| "grad_norm": 7.63291712789061, | |
| "learning_rate": 1.0850202429149799e-05, | |
| "loss": 13.9375, | |
| "mean_token_accuracy": 0.5994954109191895, | |
| "num_tokens": 15179511.0, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 7.056222313978797, | |
| "learning_rate": 1.0809716599190285e-05, | |
| "loss": 13.75, | |
| "mean_token_accuracy": 0.6037131547927856, | |
| "num_tokens": 15240170.0, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.9271255060728745, | |
| "grad_norm": 7.894406360447493, | |
| "learning_rate": 1.076923076923077e-05, | |
| "loss": 13.4609, | |
| "mean_token_accuracy": 0.5994337201118469, | |
| "num_tokens": 15300626.0, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.9311740890688259, | |
| "grad_norm": 8.960700403802914, | |
| "learning_rate": 1.0728744939271255e-05, | |
| "loss": 13.7578, | |
| "mean_token_accuracy": 0.5973832607269287, | |
| "num_tokens": 15366420.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.9311740890688259, | |
| "eval_loss": 1.713437557220459, | |
| "eval_mean_token_accuracy": 0.5988392233848572, | |
| "eval_num_tokens": 15366420.0, | |
| "eval_runtime": 0.6821, | |
| "eval_samples_per_second": 293.227, | |
| "eval_steps_per_second": 5.865, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.9352226720647774, | |
| "grad_norm": 8.458253193834494, | |
| "learning_rate": 1.0688259109311743e-05, | |
| "loss": 12.6172, | |
| "mean_token_accuracy": 0.6170424222946167, | |
| "num_tokens": 15421793.0, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.9392712550607287, | |
| "grad_norm": 7.0686015225039665, | |
| "learning_rate": 1.0647773279352228e-05, | |
| "loss": 13.5625, | |
| "mean_token_accuracy": 0.607692301273346, | |
| "num_tokens": 15485817.0, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.9433198380566802, | |
| "grad_norm": 7.576993960188947, | |
| "learning_rate": 1.0607287449392714e-05, | |
| "loss": 13.8984, | |
| "mean_token_accuracy": 0.5907003879547119, | |
| "num_tokens": 15564078.0, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.9473684210526315, | |
| "grad_norm": 6.97349951130989, | |
| "learning_rate": 1.05668016194332e-05, | |
| "loss": 13.2969, | |
| "mean_token_accuracy": 0.601026713848114, | |
| "num_tokens": 15633099.0, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.951417004048583, | |
| "grad_norm": 7.03291868031142, | |
| "learning_rate": 1.0526315789473684e-05, | |
| "loss": 13.9062, | |
| "mean_token_accuracy": 0.5920431613922119, | |
| "num_tokens": 15710279.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.951417004048583, | |
| "eval_loss": 1.7118749618530273, | |
| "eval_mean_token_accuracy": 0.5990229398012161, | |
| "eval_num_tokens": 15710279.0, | |
| "eval_runtime": 0.6566, | |
| "eval_samples_per_second": 304.588, | |
| "eval_steps_per_second": 6.092, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.9554655870445344, | |
| "grad_norm": 6.669888447299149, | |
| "learning_rate": 1.048582995951417e-05, | |
| "loss": 14.1641, | |
| "mean_token_accuracy": 0.5870757102966309, | |
| "num_tokens": 15788970.0, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.9595141700404858, | |
| "grad_norm": 7.4727551311500635, | |
| "learning_rate": 1.0445344129554658e-05, | |
| "loss": 13.3203, | |
| "mean_token_accuracy": 0.6026445031166077, | |
| "num_tokens": 15854074.0, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.9635627530364372, | |
| "grad_norm": 7.907361177198574, | |
| "learning_rate": 1.0404858299595141e-05, | |
| "loss": 12.4453, | |
| "mean_token_accuracy": 0.6250936388969421, | |
| "num_tokens": 15915540.0, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.9676113360323887, | |
| "grad_norm": 8.43850635314641, | |
| "learning_rate": 1.0364372469635629e-05, | |
| "loss": 13.1562, | |
| "mean_token_accuracy": 0.6030080318450928, | |
| "num_tokens": 15976973.0, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.97165991902834, | |
| "grad_norm": 7.4122240402160875, | |
| "learning_rate": 1.0323886639676115e-05, | |
| "loss": 13.0938, | |
| "mean_token_accuracy": 0.6093297004699707, | |
| "num_tokens": 16032923.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.97165991902834, | |
| "eval_loss": 1.709375023841858, | |
| "eval_mean_token_accuracy": 0.5996900647878647, | |
| "eval_num_tokens": 16032923.0, | |
| "eval_runtime": 0.671, | |
| "eval_samples_per_second": 298.041, | |
| "eval_steps_per_second": 5.961, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.9757085020242915, | |
| "grad_norm": 7.354557905255342, | |
| "learning_rate": 1.02834008097166e-05, | |
| "loss": 14.0156, | |
| "mean_token_accuracy": 0.6022666692733765, | |
| "num_tokens": 16094928.0, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.979757085020243, | |
| "grad_norm": 6.816456710472039, | |
| "learning_rate": 1.0242914979757086e-05, | |
| "loss": 13.2266, | |
| "mean_token_accuracy": 0.6128579378128052, | |
| "num_tokens": 16164039.0, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.9838056680161943, | |
| "grad_norm": 7.422654481239377, | |
| "learning_rate": 1.0202429149797572e-05, | |
| "loss": 14.2344, | |
| "mean_token_accuracy": 0.5937775373458862, | |
| "num_tokens": 16241339.0, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.9878542510121457, | |
| "grad_norm": 6.404505160904721, | |
| "learning_rate": 1.0161943319838056e-05, | |
| "loss": 12.7969, | |
| "mean_token_accuracy": 0.6247119903564453, | |
| "num_tokens": 16315626.0, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.9919028340080972, | |
| "grad_norm": 6.767676657987019, | |
| "learning_rate": 1.0121457489878544e-05, | |
| "loss": 13.0391, | |
| "mean_token_accuracy": 0.6165784597396851, | |
| "num_tokens": 16380702.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.9919028340080972, | |
| "eval_loss": 1.7059375047683716, | |
| "eval_mean_token_accuracy": 0.5998604446649551, | |
| "eval_num_tokens": 16380702.0, | |
| "eval_runtime": 0.6318, | |
| "eval_samples_per_second": 316.572, | |
| "eval_steps_per_second": 6.331, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.9959514170040485, | |
| "grad_norm": 8.40463068782111, | |
| "learning_rate": 1.008097165991903e-05, | |
| "loss": 13.5469, | |
| "mean_token_accuracy": 0.6080746054649353, | |
| "num_tokens": 16451754.0, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 8.465098958585465, | |
| "learning_rate": 1.0040485829959515e-05, | |
| "loss": 13.3047, | |
| "mean_token_accuracy": 0.609171450138092, | |
| "num_tokens": 16516606.0, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.0040485829959513, | |
| "grad_norm": 6.50385357417948, | |
| "learning_rate": 1e-05, | |
| "loss": 13.4297, | |
| "mean_token_accuracy": 0.6065150499343872, | |
| "num_tokens": 16584820.0, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.008097165991903, | |
| "grad_norm": 7.977746144483265, | |
| "learning_rate": 9.959514170040487e-06, | |
| "loss": 13.5391, | |
| "mean_token_accuracy": 0.6004602313041687, | |
| "num_tokens": 16652244.0, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.0121457489878543, | |
| "grad_norm": 7.116151586707309, | |
| "learning_rate": 9.919028340080973e-06, | |
| "loss": 13.3516, | |
| "mean_token_accuracy": 0.599148154258728, | |
| "num_tokens": 16715698.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.0121457489878543, | |
| "eval_loss": 1.704687476158142, | |
| "eval_mean_token_accuracy": 0.5997532159090042, | |
| "eval_num_tokens": 16715698.0, | |
| "eval_runtime": 0.6344, | |
| "eval_samples_per_second": 315.241, | |
| "eval_steps_per_second": 6.305, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.0161943319838056, | |
| "grad_norm": 7.204779568487317, | |
| "learning_rate": 9.878542510121458e-06, | |
| "loss": 13.4922, | |
| "mean_token_accuracy": 0.6105347275733948, | |
| "num_tokens": 16782076.0, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.0202429149797572, | |
| "grad_norm": 6.934398308018714, | |
| "learning_rate": 9.838056680161944e-06, | |
| "loss": 12.2969, | |
| "mean_token_accuracy": 0.6307468414306641, | |
| "num_tokens": 16844000.0, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.0242914979757085, | |
| "grad_norm": 6.262390941001453, | |
| "learning_rate": 9.79757085020243e-06, | |
| "loss": 12.1875, | |
| "mean_token_accuracy": 0.6304177045822144, | |
| "num_tokens": 16913997.0, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.0283400809716599, | |
| "grad_norm": 7.26353805403556, | |
| "learning_rate": 9.757085020242916e-06, | |
| "loss": 13.0234, | |
| "mean_token_accuracy": 0.6181889176368713, | |
| "num_tokens": 16975505.0, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.0323886639676114, | |
| "grad_norm": 7.253371839731625, | |
| "learning_rate": 9.7165991902834e-06, | |
| "loss": 12.9297, | |
| "mean_token_accuracy": 0.6192678809165955, | |
| "num_tokens": 17045831.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.0323886639676114, | |
| "eval_loss": 1.7037500143051147, | |
| "eval_mean_token_accuracy": 0.600011944770813, | |
| "eval_num_tokens": 17045831.0, | |
| "eval_runtime": 0.6321, | |
| "eval_samples_per_second": 316.386, | |
| "eval_steps_per_second": 6.328, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.0364372469635628, | |
| "grad_norm": 7.103579854054674, | |
| "learning_rate": 9.676113360323888e-06, | |
| "loss": 13.3125, | |
| "mean_token_accuracy": 0.6100465655326843, | |
| "num_tokens": 17114834.0, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.040485829959514, | |
| "grad_norm": 7.116930652514039, | |
| "learning_rate": 9.635627530364373e-06, | |
| "loss": 13.9219, | |
| "mean_token_accuracy": 0.5969142317771912, | |
| "num_tokens": 17185739.0, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.0445344129554657, | |
| "grad_norm": 7.377322519725867, | |
| "learning_rate": 9.595141700404859e-06, | |
| "loss": 12.9922, | |
| "mean_token_accuracy": 0.6139745712280273, | |
| "num_tokens": 17239157.0, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.048582995951417, | |
| "grad_norm": 6.975650481941647, | |
| "learning_rate": 9.554655870445345e-06, | |
| "loss": 13.4297, | |
| "mean_token_accuracy": 0.6126266121864319, | |
| "num_tokens": 17303593.0, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 6.359499519445163, | |
| "learning_rate": 9.514170040485831e-06, | |
| "loss": 13.2734, | |
| "mean_token_accuracy": 0.6133294105529785, | |
| "num_tokens": 17368146.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "eval_loss": 1.703125, | |
| "eval_mean_token_accuracy": 0.6001382917165756, | |
| "eval_num_tokens": 17368146.0, | |
| "eval_runtime": 0.6446, | |
| "eval_samples_per_second": 310.258, | |
| "eval_steps_per_second": 6.205, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.05668016194332, | |
| "grad_norm": 6.671493038575028, | |
| "learning_rate": 9.473684210526315e-06, | |
| "loss": 12.0781, | |
| "mean_token_accuracy": 0.6352812647819519, | |
| "num_tokens": 17428599.0, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.0607287449392713, | |
| "grad_norm": 7.378756810172295, | |
| "learning_rate": 9.433198380566803e-06, | |
| "loss": 12.2734, | |
| "mean_token_accuracy": 0.6308179497718811, | |
| "num_tokens": 17491185.0, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.0647773279352226, | |
| "grad_norm": 7.720411271822566, | |
| "learning_rate": 9.392712550607288e-06, | |
| "loss": 13.9922, | |
| "mean_token_accuracy": 0.5976829528808594, | |
| "num_tokens": 17558922.0, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.0688259109311742, | |
| "grad_norm": 7.148823701274059, | |
| "learning_rate": 9.352226720647774e-06, | |
| "loss": 12.6562, | |
| "mean_token_accuracy": 0.6221409440040588, | |
| "num_tokens": 17625704.0, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.0728744939271255, | |
| "grad_norm": 6.952211161763132, | |
| "learning_rate": 9.31174089068826e-06, | |
| "loss": 13.0, | |
| "mean_token_accuracy": 0.6169036030769348, | |
| "num_tokens": 17693375.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.0728744939271255, | |
| "eval_loss": 1.7009375095367432, | |
| "eval_mean_token_accuracy": 0.6005319058895111, | |
| "eval_num_tokens": 17693375.0, | |
| "eval_runtime": 0.6522, | |
| "eval_samples_per_second": 306.677, | |
| "eval_steps_per_second": 6.134, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.0769230769230769, | |
| "grad_norm": 7.532405610943497, | |
| "learning_rate": 9.271255060728746e-06, | |
| "loss": 13.4609, | |
| "mean_token_accuracy": 0.6039229035377502, | |
| "num_tokens": 17756505.0, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.0809716599190284, | |
| "grad_norm": 6.463674669796744, | |
| "learning_rate": 9.230769230769232e-06, | |
| "loss": 12.7422, | |
| "mean_token_accuracy": 0.6189759969711304, | |
| "num_tokens": 17827037.0, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.0850202429149798, | |
| "grad_norm": 7.120027379484065, | |
| "learning_rate": 9.190283400809717e-06, | |
| "loss": 13.7656, | |
| "mean_token_accuracy": 0.5981738567352295, | |
| "num_tokens": 17892704.0, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.0890688259109311, | |
| "grad_norm": 6.2456984016212065, | |
| "learning_rate": 9.149797570850203e-06, | |
| "loss": 12.7344, | |
| "mean_token_accuracy": 0.6219497919082642, | |
| "num_tokens": 17968213.0, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.0931174089068827, | |
| "grad_norm": 6.297910754288368, | |
| "learning_rate": 9.109311740890689e-06, | |
| "loss": 12.7188, | |
| "mean_token_accuracy": 0.6200737357139587, | |
| "num_tokens": 18039604.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.0931174089068827, | |
| "eval_loss": 1.7000000476837158, | |
| "eval_mean_token_accuracy": 0.6004240810871124, | |
| "eval_num_tokens": 18039604.0, | |
| "eval_runtime": 0.6437, | |
| "eval_samples_per_second": 310.685, | |
| "eval_steps_per_second": 6.214, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.097165991902834, | |
| "grad_norm": 7.198610948868953, | |
| "learning_rate": 9.068825910931175e-06, | |
| "loss": 12.2812, | |
| "mean_token_accuracy": 0.6254372000694275, | |
| "num_tokens": 18096278.0, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.1012145748987854, | |
| "grad_norm": 6.306684772679187, | |
| "learning_rate": 9.02834008097166e-06, | |
| "loss": 13.7031, | |
| "mean_token_accuracy": 0.6017244458198547, | |
| "num_tokens": 18175556.0, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.1052631578947367, | |
| "grad_norm": 6.50769292978356, | |
| "learning_rate": 8.987854251012147e-06, | |
| "loss": 12.2578, | |
| "mean_token_accuracy": 0.6318495273590088, | |
| "num_tokens": 18243307.0, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.1093117408906883, | |
| "grad_norm": 6.913532871754586, | |
| "learning_rate": 8.947368421052632e-06, | |
| "loss": 12.9922, | |
| "mean_token_accuracy": 0.61337810754776, | |
| "num_tokens": 18309195.0, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.1133603238866396, | |
| "grad_norm": 6.884920896555758, | |
| "learning_rate": 8.906882591093118e-06, | |
| "loss": 13.875, | |
| "mean_token_accuracy": 0.6053746342658997, | |
| "num_tokens": 18376761.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.1133603238866396, | |
| "eval_loss": 1.6981250047683716, | |
| "eval_mean_token_accuracy": 0.6006451994180679, | |
| "eval_num_tokens": 18376761.0, | |
| "eval_runtime": 0.6441, | |
| "eval_samples_per_second": 310.493, | |
| "eval_steps_per_second": 6.21, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.117408906882591, | |
| "grad_norm": 7.596057850426039, | |
| "learning_rate": 8.866396761133604e-06, | |
| "loss": 13.5703, | |
| "mean_token_accuracy": 0.5959470868110657, | |
| "num_tokens": 18438213.0, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.1214574898785425, | |
| "grad_norm": 7.244460879105897, | |
| "learning_rate": 8.82591093117409e-06, | |
| "loss": 13.6094, | |
| "mean_token_accuracy": 0.6075268983840942, | |
| "num_tokens": 18508957.0, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.125506072874494, | |
| "grad_norm": 7.695579384649985, | |
| "learning_rate": 8.785425101214575e-06, | |
| "loss": 13.1641, | |
| "mean_token_accuracy": 0.6176554560661316, | |
| "num_tokens": 18565083.0, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.1295546558704452, | |
| "grad_norm": 6.522870352716677, | |
| "learning_rate": 8.744939271255063e-06, | |
| "loss": 12.6953, | |
| "mean_token_accuracy": 0.6225432753562927, | |
| "num_tokens": 18633531.0, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.1336032388663968, | |
| "grad_norm": 7.604533792660402, | |
| "learning_rate": 8.704453441295547e-06, | |
| "loss": 12.4688, | |
| "mean_token_accuracy": 0.6240532398223877, | |
| "num_tokens": 18689313.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.1336032388663968, | |
| "eval_loss": 1.6974999904632568, | |
| "eval_mean_token_accuracy": 0.6007698774337769, | |
| "eval_num_tokens": 18689313.0, | |
| "eval_runtime": 0.6368, | |
| "eval_samples_per_second": 314.051, | |
| "eval_steps_per_second": 6.281, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.1376518218623481, | |
| "grad_norm": 6.192506768790122, | |
| "learning_rate": 8.663967611336033e-06, | |
| "loss": 13.0312, | |
| "mean_token_accuracy": 0.6120434999465942, | |
| "num_tokens": 18756467.0, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.1417004048582995, | |
| "grad_norm": 5.735298339214011, | |
| "learning_rate": 8.62348178137652e-06, | |
| "loss": 12.3672, | |
| "mean_token_accuracy": 0.6290688514709473, | |
| "num_tokens": 18830631.0, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.145748987854251, | |
| "grad_norm": 6.8807292772906985, | |
| "learning_rate": 8.582995951417005e-06, | |
| "loss": 12.9688, | |
| "mean_token_accuracy": 0.6203005313873291, | |
| "num_tokens": 18902033.0, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.1497975708502024, | |
| "grad_norm": 6.299127839988924, | |
| "learning_rate": 8.54251012145749e-06, | |
| "loss": 11.7344, | |
| "mean_token_accuracy": 0.6383894085884094, | |
| "num_tokens": 18965552.0, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.1538461538461537, | |
| "grad_norm": 7.602932263969704, | |
| "learning_rate": 8.502024291497976e-06, | |
| "loss": 13.7656, | |
| "mean_token_accuracy": 0.6015710830688477, | |
| "num_tokens": 19032067.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.1538461538461537, | |
| "eval_loss": 1.6959375143051147, | |
| "eval_mean_token_accuracy": 0.6004112809896469, | |
| "eval_num_tokens": 19032067.0, | |
| "eval_runtime": 0.6551, | |
| "eval_samples_per_second": 305.283, | |
| "eval_steps_per_second": 6.106, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.1578947368421053, | |
| "grad_norm": 6.900267004886904, | |
| "learning_rate": 8.461538461538462e-06, | |
| "loss": 12.1562, | |
| "mean_token_accuracy": 0.6340711116790771, | |
| "num_tokens": 19091771.0, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.1619433198380567, | |
| "grad_norm": 7.015232796047964, | |
| "learning_rate": 8.421052631578948e-06, | |
| "loss": 12.7578, | |
| "mean_token_accuracy": 0.6194669604301453, | |
| "num_tokens": 19160875.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.165991902834008, | |
| "grad_norm": 8.07776854225312, | |
| "learning_rate": 8.380566801619434e-06, | |
| "loss": 12.8672, | |
| "mean_token_accuracy": 0.621356189250946, | |
| "num_tokens": 19234248.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.1700404858299596, | |
| "grad_norm": 6.886729565425438, | |
| "learning_rate": 8.340080971659919e-06, | |
| "loss": 13.3125, | |
| "mean_token_accuracy": 0.6096766591072083, | |
| "num_tokens": 19300037.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.174089068825911, | |
| "grad_norm": 7.451714517536367, | |
| "learning_rate": 8.299595141700405e-06, | |
| "loss": 12.9062, | |
| "mean_token_accuracy": 0.6140288710594177, | |
| "num_tokens": 19359507.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.174089068825911, | |
| "eval_loss": 1.6950000524520874, | |
| "eval_mean_token_accuracy": 0.600911945104599, | |
| "eval_num_tokens": 19359507.0, | |
| "eval_runtime": 0.6622, | |
| "eval_samples_per_second": 302.012, | |
| "eval_steps_per_second": 6.04, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.1781376518218623, | |
| "grad_norm": 6.19859060055071, | |
| "learning_rate": 8.259109311740891e-06, | |
| "loss": 12.9922, | |
| "mean_token_accuracy": 0.6148094534873962, | |
| "num_tokens": 19426042.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.1821862348178138, | |
| "grad_norm": 6.3346510938910345, | |
| "learning_rate": 8.218623481781377e-06, | |
| "loss": 12.8516, | |
| "mean_token_accuracy": 0.615065336227417, | |
| "num_tokens": 19494767.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.1862348178137652, | |
| "grad_norm": 7.185444383014579, | |
| "learning_rate": 8.178137651821862e-06, | |
| "loss": 12.8281, | |
| "mean_token_accuracy": 0.6221780776977539, | |
| "num_tokens": 19561762.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.1902834008097165, | |
| "grad_norm": 7.763951709420421, | |
| "learning_rate": 8.13765182186235e-06, | |
| "loss": 13.5312, | |
| "mean_token_accuracy": 0.6010787487030029, | |
| "num_tokens": 19629313.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.194331983805668, | |
| "grad_norm": 6.6829771026826785, | |
| "learning_rate": 8.097165991902834e-06, | |
| "loss": 12.6875, | |
| "mean_token_accuracy": 0.6254391074180603, | |
| "num_tokens": 19699694.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.194331983805668, | |
| "eval_loss": 1.6931250095367432, | |
| "eval_mean_token_accuracy": 0.6008877456188202, | |
| "eval_num_tokens": 19699694.0, | |
| "eval_runtime": 0.6362, | |
| "eval_samples_per_second": 314.38, | |
| "eval_steps_per_second": 6.288, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.1983805668016194, | |
| "grad_norm": 6.897120986568694, | |
| "learning_rate": 8.056680161943322e-06, | |
| "loss": 12.2109, | |
| "mean_token_accuracy": 0.6307302713394165, | |
| "num_tokens": 19757434.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.2024291497975708, | |
| "grad_norm": 7.44384813842545, | |
| "learning_rate": 8.016194331983806e-06, | |
| "loss": 12.7656, | |
| "mean_token_accuracy": 0.6187056303024292, | |
| "num_tokens": 19820570.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.2064777327935223, | |
| "grad_norm": 6.4203676546627015, | |
| "learning_rate": 7.975708502024292e-06, | |
| "loss": 13.2812, | |
| "mean_token_accuracy": 0.6042314767837524, | |
| "num_tokens": 19890728.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.2105263157894737, | |
| "grad_norm": 6.848564846162855, | |
| "learning_rate": 7.935222672064778e-06, | |
| "loss": 13.0703, | |
| "mean_token_accuracy": 0.6080402731895447, | |
| "num_tokens": 19959322.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.214574898785425, | |
| "grad_norm": 6.862334865838493, | |
| "learning_rate": 7.894736842105265e-06, | |
| "loss": 13.3516, | |
| "mean_token_accuracy": 0.6005032062530518, | |
| "num_tokens": 20031722.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.214574898785425, | |
| "eval_loss": 1.691562533378601, | |
| "eval_mean_token_accuracy": 0.6006248891353607, | |
| "eval_num_tokens": 20031722.0, | |
| "eval_runtime": 0.6358, | |
| "eval_samples_per_second": 314.55, | |
| "eval_steps_per_second": 6.291, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.2186234817813766, | |
| "grad_norm": 7.001796551105443, | |
| "learning_rate": 7.854251012145749e-06, | |
| "loss": 12.7109, | |
| "mean_token_accuracy": 0.6257545948028564, | |
| "num_tokens": 20104837.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.222672064777328, | |
| "grad_norm": 7.289919019857029, | |
| "learning_rate": 7.813765182186235e-06, | |
| "loss": 12.8828, | |
| "mean_token_accuracy": 0.6084967851638794, | |
| "num_tokens": 20172668.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.2267206477732793, | |
| "grad_norm": 6.345630914099327, | |
| "learning_rate": 7.773279352226721e-06, | |
| "loss": 12.9141, | |
| "mean_token_accuracy": 0.6153846383094788, | |
| "num_tokens": 20241372.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.2307692307692308, | |
| "grad_norm": 7.5520644152330245, | |
| "learning_rate": 7.732793522267207e-06, | |
| "loss": 13.6797, | |
| "mean_token_accuracy": 0.607166588306427, | |
| "num_tokens": 20310730.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.2348178137651822, | |
| "grad_norm": 8.642720338892964, | |
| "learning_rate": 7.692307692307694e-06, | |
| "loss": 12.9219, | |
| "mean_token_accuracy": 0.6174276471138, | |
| "num_tokens": 20378755.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.2348178137651822, | |
| "eval_loss": 1.6906249523162842, | |
| "eval_mean_token_accuracy": 0.601470410823822, | |
| "eval_num_tokens": 20378755.0, | |
| "eval_runtime": 0.6338, | |
| "eval_samples_per_second": 315.573, | |
| "eval_steps_per_second": 6.311, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.2388663967611335, | |
| "grad_norm": 6.901290072516897, | |
| "learning_rate": 7.651821862348178e-06, | |
| "loss": 13.1094, | |
| "mean_token_accuracy": 0.6087667942047119, | |
| "num_tokens": 20449130.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.242914979757085, | |
| "grad_norm": 5.8437165539153995, | |
| "learning_rate": 7.611336032388664e-06, | |
| "loss": 12.1797, | |
| "mean_token_accuracy": 0.6274487972259521, | |
| "num_tokens": 20511878.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.2469635627530364, | |
| "grad_norm": 6.275119502029468, | |
| "learning_rate": 7.570850202429151e-06, | |
| "loss": 13.0, | |
| "mean_token_accuracy": 0.6140267252922058, | |
| "num_tokens": 20580939.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.2510121457489878, | |
| "grad_norm": 6.632528297788201, | |
| "learning_rate": 7.5303643724696364e-06, | |
| "loss": 12.2969, | |
| "mean_token_accuracy": 0.6276517510414124, | |
| "num_tokens": 20646337.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.2550607287449393, | |
| "grad_norm": 6.802264706016231, | |
| "learning_rate": 7.489878542510122e-06, | |
| "loss": 12.5312, | |
| "mean_token_accuracy": 0.6189518570899963, | |
| "num_tokens": 20706089.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.2550607287449393, | |
| "eval_loss": 1.6903125047683716, | |
| "eval_mean_token_accuracy": 0.6015071421861649, | |
| "eval_num_tokens": 20706089.0, | |
| "eval_runtime": 0.6486, | |
| "eval_samples_per_second": 308.342, | |
| "eval_steps_per_second": 6.167, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.2591093117408907, | |
| "grad_norm": 6.618283699689559, | |
| "learning_rate": 7.449392712550608e-06, | |
| "loss": 12.2422, | |
| "mean_token_accuracy": 0.6300427317619324, | |
| "num_tokens": 20776353.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.263157894736842, | |
| "grad_norm": 6.385834652503417, | |
| "learning_rate": 7.408906882591094e-06, | |
| "loss": 13.1641, | |
| "mean_token_accuracy": 0.6120158433914185, | |
| "num_tokens": 20839051.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.2672064777327936, | |
| "grad_norm": 6.604656501463769, | |
| "learning_rate": 7.368421052631579e-06, | |
| "loss": 12.3516, | |
| "mean_token_accuracy": 0.6265976428985596, | |
| "num_tokens": 20907888.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.271255060728745, | |
| "grad_norm": 6.333208429557312, | |
| "learning_rate": 7.327935222672065e-06, | |
| "loss": 12.2422, | |
| "mean_token_accuracy": 0.6266438961029053, | |
| "num_tokens": 20973573.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.2753036437246963, | |
| "grad_norm": 6.851872002967978, | |
| "learning_rate": 7.2874493927125516e-06, | |
| "loss": 13.3438, | |
| "mean_token_accuracy": 0.6068040132522583, | |
| "num_tokens": 21041626.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.2753036437246963, | |
| "eval_loss": 1.690000057220459, | |
| "eval_mean_token_accuracy": 0.6009943783283234, | |
| "eval_num_tokens": 21041626.0, | |
| "eval_runtime": 0.6597, | |
| "eval_samples_per_second": 303.162, | |
| "eval_steps_per_second": 6.063, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.2793522267206479, | |
| "grad_norm": 7.312872948522801, | |
| "learning_rate": 7.246963562753037e-06, | |
| "loss": 13.3906, | |
| "mean_token_accuracy": 0.6119928359985352, | |
| "num_tokens": 21109864.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.2834008097165992, | |
| "grad_norm": 7.258406742426028, | |
| "learning_rate": 7.206477732793523e-06, | |
| "loss": 13.6797, | |
| "mean_token_accuracy": 0.6022233963012695, | |
| "num_tokens": 21180631.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.2874493927125505, | |
| "grad_norm": 7.236283798071484, | |
| "learning_rate": 7.165991902834008e-06, | |
| "loss": 13.5156, | |
| "mean_token_accuracy": 0.607459545135498, | |
| "num_tokens": 21249144.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.291497975708502, | |
| "grad_norm": 6.443354672755848, | |
| "learning_rate": 7.125506072874494e-06, | |
| "loss": 12.5156, | |
| "mean_token_accuracy": 0.6236591339111328, | |
| "num_tokens": 21315677.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.2955465587044535, | |
| "grad_norm": 6.703854250267342, | |
| "learning_rate": 7.0850202429149805e-06, | |
| "loss": 12.0312, | |
| "mean_token_accuracy": 0.6358534693717957, | |
| "num_tokens": 21370804.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.2955465587044535, | |
| "eval_loss": 1.6896874904632568, | |
| "eval_mean_token_accuracy": 0.6014674603939056, | |
| "eval_num_tokens": 21370804.0, | |
| "eval_runtime": 0.6354, | |
| "eval_samples_per_second": 314.742, | |
| "eval_steps_per_second": 6.295, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.2995951417004048, | |
| "grad_norm": 6.223799662239967, | |
| "learning_rate": 7.044534412955466e-06, | |
| "loss": 13.5234, | |
| "mean_token_accuracy": 0.6001178622245789, | |
| "num_tokens": 21442134.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.3036437246963564, | |
| "grad_norm": 6.245172102328749, | |
| "learning_rate": 7.004048582995951e-06, | |
| "loss": 13.3438, | |
| "mean_token_accuracy": 0.6050729751586914, | |
| "num_tokens": 21505987.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.3076923076923077, | |
| "grad_norm": 7.640593911902389, | |
| "learning_rate": 6.963562753036438e-06, | |
| "loss": 13.125, | |
| "mean_token_accuracy": 0.614285945892334, | |
| "num_tokens": 21571444.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.311740890688259, | |
| "grad_norm": 6.182617066137327, | |
| "learning_rate": 6.923076923076923e-06, | |
| "loss": 12.9219, | |
| "mean_token_accuracy": 0.6171441078186035, | |
| "num_tokens": 21631622.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "grad_norm": 6.612642501099769, | |
| "learning_rate": 6.882591093117409e-06, | |
| "loss": 13.1328, | |
| "mean_token_accuracy": 0.6197838187217712, | |
| "num_tokens": 21699600.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "eval_loss": 1.6884374618530273, | |
| "eval_mean_token_accuracy": 0.6019182503223419, | |
| "eval_num_tokens": 21699600.0, | |
| "eval_runtime": 0.6487, | |
| "eval_samples_per_second": 308.327, | |
| "eval_steps_per_second": 6.167, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.319838056680162, | |
| "grad_norm": 6.268645760381665, | |
| "learning_rate": 6.842105263157896e-06, | |
| "loss": 13.0859, | |
| "mean_token_accuracy": 0.6107904314994812, | |
| "num_tokens": 21770004.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.3238866396761133, | |
| "grad_norm": 6.480172792037266, | |
| "learning_rate": 6.801619433198381e-06, | |
| "loss": 13.4766, | |
| "mean_token_accuracy": 0.6045525670051575, | |
| "num_tokens": 21835174.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.3279352226720649, | |
| "grad_norm": 6.89015735331616, | |
| "learning_rate": 6.761133603238867e-06, | |
| "loss": 12.0156, | |
| "mean_token_accuracy": 0.6316675543785095, | |
| "num_tokens": 21901773.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.3319838056680162, | |
| "grad_norm": 6.136078812928923, | |
| "learning_rate": 6.720647773279353e-06, | |
| "loss": 12.8594, | |
| "mean_token_accuracy": 0.6160688400268555, | |
| "num_tokens": 21973454.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.3360323886639676, | |
| "grad_norm": 6.530535217583928, | |
| "learning_rate": 6.6801619433198385e-06, | |
| "loss": 13.3281, | |
| "mean_token_accuracy": 0.608657717704773, | |
| "num_tokens": 22043421.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.3360323886639676, | |
| "eval_loss": 1.6871875524520874, | |
| "eval_mean_token_accuracy": 0.6020240485668182, | |
| "eval_num_tokens": 22043421.0, | |
| "eval_runtime": 0.6344, | |
| "eval_samples_per_second": 315.283, | |
| "eval_steps_per_second": 6.306, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.3400809716599191, | |
| "grad_norm": 7.032557524136393, | |
| "learning_rate": 6.639676113360325e-06, | |
| "loss": 13.6797, | |
| "mean_token_accuracy": 0.5984200835227966, | |
| "num_tokens": 22112983.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.3441295546558705, | |
| "grad_norm": 6.878568332018199, | |
| "learning_rate": 6.599190283400811e-06, | |
| "loss": 13.625, | |
| "mean_token_accuracy": 0.60277259349823, | |
| "num_tokens": 22174362.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.3481781376518218, | |
| "grad_norm": 6.2805599900614615, | |
| "learning_rate": 6.558704453441296e-06, | |
| "loss": 14.2031, | |
| "mean_token_accuracy": 0.592296838760376, | |
| "num_tokens": 22245929.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.3522267206477734, | |
| "grad_norm": 6.851335318270516, | |
| "learning_rate": 6.518218623481782e-06, | |
| "loss": 13.8516, | |
| "mean_token_accuracy": 0.6001754403114319, | |
| "num_tokens": 22315536.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.3562753036437247, | |
| "grad_norm": 6.690083307280702, | |
| "learning_rate": 6.4777327935222675e-06, | |
| "loss": 13.9688, | |
| "mean_token_accuracy": 0.5958229303359985, | |
| "num_tokens": 22381388.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.3562753036437247, | |
| "eval_loss": 1.6865625381469727, | |
| "eval_mean_token_accuracy": 0.601932093501091, | |
| "eval_num_tokens": 22381388.0, | |
| "eval_runtime": 0.6419, | |
| "eval_samples_per_second": 311.575, | |
| "eval_steps_per_second": 6.232, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.360323886639676, | |
| "grad_norm": 7.1780394505714336, | |
| "learning_rate": 6.437246963562754e-06, | |
| "loss": 12.5156, | |
| "mean_token_accuracy": 0.6246969699859619, | |
| "num_tokens": 22448683.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.3643724696356276, | |
| "grad_norm": 7.289906239024176, | |
| "learning_rate": 6.39676113360324e-06, | |
| "loss": 13.5469, | |
| "mean_token_accuracy": 0.6012796759605408, | |
| "num_tokens": 22510793.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.368421052631579, | |
| "grad_norm": 6.901589521281514, | |
| "learning_rate": 6.356275303643725e-06, | |
| "loss": 13.5234, | |
| "mean_token_accuracy": 0.6072494983673096, | |
| "num_tokens": 22579193.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.3724696356275303, | |
| "grad_norm": 6.256711193594864, | |
| "learning_rate": 6.31578947368421e-06, | |
| "loss": 12.7188, | |
| "mean_token_accuracy": 0.622461199760437, | |
| "num_tokens": 22649271.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.376518218623482, | |
| "grad_norm": 5.928363958207777, | |
| "learning_rate": 6.275303643724697e-06, | |
| "loss": 12.8203, | |
| "mean_token_accuracy": 0.6171985268592834, | |
| "num_tokens": 22721597.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.376518218623482, | |
| "eval_loss": 1.6862499713897705, | |
| "eval_mean_token_accuracy": 0.6023992896080017, | |
| "eval_num_tokens": 22721597.0, | |
| "eval_runtime": 0.6436, | |
| "eval_samples_per_second": 310.773, | |
| "eval_steps_per_second": 6.215, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.3805668016194332, | |
| "grad_norm": 7.046994398153724, | |
| "learning_rate": 6.234817813765183e-06, | |
| "loss": 13.5625, | |
| "mean_token_accuracy": 0.6081273555755615, | |
| "num_tokens": 22788300.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.3846153846153846, | |
| "grad_norm": 5.679939194503434, | |
| "learning_rate": 6.194331983805668e-06, | |
| "loss": 12.5, | |
| "mean_token_accuracy": 0.6215187907218933, | |
| "num_tokens": 22859065.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.3886639676113361, | |
| "grad_norm": 6.613152950818566, | |
| "learning_rate": 6.153846153846155e-06, | |
| "loss": 12.8125, | |
| "mean_token_accuracy": 0.6154517531394958, | |
| "num_tokens": 22923264.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.3927125506072875, | |
| "grad_norm": 6.167835687979181, | |
| "learning_rate": 6.11336032388664e-06, | |
| "loss": 12.8047, | |
| "mean_token_accuracy": 0.6202464699745178, | |
| "num_tokens": 22980128.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.3967611336032388, | |
| "grad_norm": 6.861084643476399, | |
| "learning_rate": 6.0728744939271254e-06, | |
| "loss": 12.0469, | |
| "mean_token_accuracy": 0.6357682347297668, | |
| "num_tokens": 23038976.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.3967611336032388, | |
| "eval_loss": 1.6846874952316284, | |
| "eval_mean_token_accuracy": 0.6027306467294693, | |
| "eval_num_tokens": 23038976.0, | |
| "eval_runtime": 0.6479, | |
| "eval_samples_per_second": 308.696, | |
| "eval_steps_per_second": 6.174, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.4008097165991904, | |
| "grad_norm": 6.0083674637757065, | |
| "learning_rate": 6.0323886639676124e-06, | |
| "loss": 12.8516, | |
| "mean_token_accuracy": 0.6229951977729797, | |
| "num_tokens": 23113673.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.4048582995951417, | |
| "grad_norm": 6.91334940259032, | |
| "learning_rate": 5.991902834008098e-06, | |
| "loss": 13.6016, | |
| "mean_token_accuracy": 0.6017605066299438, | |
| "num_tokens": 23182354.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.408906882591093, | |
| "grad_norm": 7.225053680436044, | |
| "learning_rate": 5.951417004048583e-06, | |
| "loss": 13.2812, | |
| "mean_token_accuracy": 0.6096216440200806, | |
| "num_tokens": 23244819.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.4129554655870447, | |
| "grad_norm": 6.910314738528923, | |
| "learning_rate": 5.91093117408907e-06, | |
| "loss": 12.5781, | |
| "mean_token_accuracy": 0.6233025193214417, | |
| "num_tokens": 23301143.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.417004048582996, | |
| "grad_norm": 13.172174214194689, | |
| "learning_rate": 5.870445344129555e-06, | |
| "loss": 13.7188, | |
| "mean_token_accuracy": 0.5989026427268982, | |
| "num_tokens": 23373743.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.417004048582996, | |
| "eval_loss": 1.683750033378601, | |
| "eval_mean_token_accuracy": 0.6025040000677109, | |
| "eval_num_tokens": 23373743.0, | |
| "eval_runtime": 0.638, | |
| "eval_samples_per_second": 313.493, | |
| "eval_steps_per_second": 6.27, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.4210526315789473, | |
| "grad_norm": 7.059247196559215, | |
| "learning_rate": 5.8299595141700406e-06, | |
| "loss": 12.2812, | |
| "mean_token_accuracy": 0.6239601373672485, | |
| "num_tokens": 23441002.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.425101214574899, | |
| "grad_norm": 6.949818046383843, | |
| "learning_rate": 5.789473684210527e-06, | |
| "loss": 12.3125, | |
| "mean_token_accuracy": 0.6313892006874084, | |
| "num_tokens": 23495161.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.4291497975708503, | |
| "grad_norm": 6.059843894633797, | |
| "learning_rate": 5.748987854251013e-06, | |
| "loss": 13.2344, | |
| "mean_token_accuracy": 0.6091973781585693, | |
| "num_tokens": 23564636.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.4331983805668016, | |
| "grad_norm": 6.849778295075088, | |
| "learning_rate": 5.708502024291498e-06, | |
| "loss": 14.1484, | |
| "mean_token_accuracy": 0.5898481011390686, | |
| "num_tokens": 23634618.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.4372469635627532, | |
| "grad_norm": 7.196515102955155, | |
| "learning_rate": 5.668016194331984e-06, | |
| "loss": 13.4844, | |
| "mean_token_accuracy": 0.608712911605835, | |
| "num_tokens": 23697715.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.4372469635627532, | |
| "eval_loss": 1.683437466621399, | |
| "eval_mean_token_accuracy": 0.6026097536087036, | |
| "eval_num_tokens": 23697715.0, | |
| "eval_runtime": 0.6368, | |
| "eval_samples_per_second": 314.046, | |
| "eval_steps_per_second": 6.281, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.4412955465587045, | |
| "grad_norm": 6.083851158318087, | |
| "learning_rate": 5.6275303643724695e-06, | |
| "loss": 13.1719, | |
| "mean_token_accuracy": 0.6111401319503784, | |
| "num_tokens": 23762805.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.4453441295546559, | |
| "grad_norm": 6.3854204394565865, | |
| "learning_rate": 5.5870445344129565e-06, | |
| "loss": 14.1484, | |
| "mean_token_accuracy": 0.5870954990386963, | |
| "num_tokens": 23824801.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.4493927125506074, | |
| "grad_norm": 6.205029195096519, | |
| "learning_rate": 5.546558704453442e-06, | |
| "loss": 12.5078, | |
| "mean_token_accuracy": 0.6245027184486389, | |
| "num_tokens": 23893486.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.4534412955465588, | |
| "grad_norm": 6.498352605010108, | |
| "learning_rate": 5.506072874493927e-06, | |
| "loss": 13.1406, | |
| "mean_token_accuracy": 0.6103119254112244, | |
| "num_tokens": 23956739.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.45748987854251, | |
| "grad_norm": 7.863658695213157, | |
| "learning_rate": 5.465587044534414e-06, | |
| "loss": 12.2969, | |
| "mean_token_accuracy": 0.6250467896461487, | |
| "num_tokens": 24015613.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.45748987854251, | |
| "eval_loss": 1.6828124523162842, | |
| "eval_mean_token_accuracy": 0.6025930494070053, | |
| "eval_num_tokens": 24015613.0, | |
| "eval_runtime": 0.6414, | |
| "eval_samples_per_second": 311.802, | |
| "eval_steps_per_second": 6.236, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.4615384615384617, | |
| "grad_norm": 6.004778979711357, | |
| "learning_rate": 5.425101214574899e-06, | |
| "loss": 13.375, | |
| "mean_token_accuracy": 0.6013794541358948, | |
| "num_tokens": 24093533.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.465587044534413, | |
| "grad_norm": 6.524314866552431, | |
| "learning_rate": 5.384615384615385e-06, | |
| "loss": 13.1094, | |
| "mean_token_accuracy": 0.6128621101379395, | |
| "num_tokens": 24172029.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.4696356275303644, | |
| "grad_norm": 5.757817859771559, | |
| "learning_rate": 5.344129554655872e-06, | |
| "loss": 12.7188, | |
| "mean_token_accuracy": 0.6142829060554504, | |
| "num_tokens": 24238844.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.4736842105263157, | |
| "grad_norm": 5.843888667434192, | |
| "learning_rate": 5.303643724696357e-06, | |
| "loss": 12.4688, | |
| "mean_token_accuracy": 0.6290068030357361, | |
| "num_tokens": 24308602.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.4777327935222673, | |
| "grad_norm": 7.095669449455518, | |
| "learning_rate": 5.263157894736842e-06, | |
| "loss": 13.2891, | |
| "mean_token_accuracy": 0.6107263565063477, | |
| "num_tokens": 24381011.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.4777327935222673, | |
| "eval_loss": 1.6828124523162842, | |
| "eval_mean_token_accuracy": 0.6028460413217545, | |
| "eval_num_tokens": 24381011.0, | |
| "eval_runtime": 0.6426, | |
| "eval_samples_per_second": 311.246, | |
| "eval_steps_per_second": 6.225, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.4817813765182186, | |
| "grad_norm": 7.089500617007727, | |
| "learning_rate": 5.222672064777329e-06, | |
| "loss": 13.2422, | |
| "mean_token_accuracy": 0.6082656383514404, | |
| "num_tokens": 24447301.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.48582995951417, | |
| "grad_norm": 5.759008858515851, | |
| "learning_rate": 5.1821862348178145e-06, | |
| "loss": 11.7812, | |
| "mean_token_accuracy": 0.6373518109321594, | |
| "num_tokens": 24521514.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.4898785425101215, | |
| "grad_norm": 7.257765350400875, | |
| "learning_rate": 5.1417004048583e-06, | |
| "loss": 13.4219, | |
| "mean_token_accuracy": 0.6083607077598572, | |
| "num_tokens": 24587266.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.4939271255060729, | |
| "grad_norm": 6.617774961612806, | |
| "learning_rate": 5.101214574898786e-06, | |
| "loss": 11.9922, | |
| "mean_token_accuracy": 0.6397205591201782, | |
| "num_tokens": 24647883.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.4979757085020242, | |
| "grad_norm": 6.034070184034884, | |
| "learning_rate": 5.060728744939272e-06, | |
| "loss": 14.0781, | |
| "mean_token_accuracy": 0.5942122936248779, | |
| "num_tokens": 24723728.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.4979757085020242, | |
| "eval_loss": 1.682187557220459, | |
| "eval_mean_token_accuracy": 0.6028562039136887, | |
| "eval_num_tokens": 24723728.0, | |
| "eval_runtime": 0.6595, | |
| "eval_samples_per_second": 303.264, | |
| "eval_steps_per_second": 6.065, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.5020242914979756, | |
| "grad_norm": 7.413748338165612, | |
| "learning_rate": 5.020242914979757e-06, | |
| "loss": 13.0234, | |
| "mean_token_accuracy": 0.6105521321296692, | |
| "num_tokens": 24786206.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.5060728744939271, | |
| "grad_norm": 7.4171774757668745, | |
| "learning_rate": 4.9797570850202435e-06, | |
| "loss": 13.1641, | |
| "mean_token_accuracy": 0.6107207536697388, | |
| "num_tokens": 24851452.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.5101214574898787, | |
| "grad_norm": 6.678125850261295, | |
| "learning_rate": 4.939271255060729e-06, | |
| "loss": 12.1406, | |
| "mean_token_accuracy": 0.6283853650093079, | |
| "num_tokens": 24912625.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.5141700404858298, | |
| "grad_norm": 6.7202926787891375, | |
| "learning_rate": 4.898785425101215e-06, | |
| "loss": 13.0469, | |
| "mean_token_accuracy": 0.6010765433311462, | |
| "num_tokens": 24976596.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.5182186234817814, | |
| "grad_norm": 7.189454405906928, | |
| "learning_rate": 4.8582995951417e-06, | |
| "loss": 11.9141, | |
| "mean_token_accuracy": 0.6381216049194336, | |
| "num_tokens": 25030493.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.5182186234817814, | |
| "eval_loss": 1.6809375286102295, | |
| "eval_mean_token_accuracy": 0.6026807576417923, | |
| "eval_num_tokens": 25030493.0, | |
| "eval_runtime": 0.6393, | |
| "eval_samples_per_second": 312.855, | |
| "eval_steps_per_second": 6.257, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.522267206477733, | |
| "grad_norm": 6.309792255182094, | |
| "learning_rate": 4.817813765182186e-06, | |
| "loss": 12.0938, | |
| "mean_token_accuracy": 0.6288850903511047, | |
| "num_tokens": 25094133.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.526315789473684, | |
| "grad_norm": 7.06660860300255, | |
| "learning_rate": 4.7773279352226725e-06, | |
| "loss": 12.7812, | |
| "mean_token_accuracy": 0.6158961057662964, | |
| "num_tokens": 25155860.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.5303643724696356, | |
| "grad_norm": 8.076900211471674, | |
| "learning_rate": 4.736842105263158e-06, | |
| "loss": 12.5859, | |
| "mean_token_accuracy": 0.6151171922683716, | |
| "num_tokens": 25211146.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.5344129554655872, | |
| "grad_norm": 6.221731683405277, | |
| "learning_rate": 4.696356275303644e-06, | |
| "loss": 12.4609, | |
| "mean_token_accuracy": 0.6272507905960083, | |
| "num_tokens": 25280353.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.5384615384615383, | |
| "grad_norm": 7.4685425851793665, | |
| "learning_rate": 4.65587044534413e-06, | |
| "loss": 13.1562, | |
| "mean_token_accuracy": 0.6171966195106506, | |
| "num_tokens": 25335393.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.5384615384615383, | |
| "eval_loss": 1.6803125143051147, | |
| "eval_mean_token_accuracy": 0.6025642305612564, | |
| "eval_num_tokens": 25335393.0, | |
| "eval_runtime": 0.6333, | |
| "eval_samples_per_second": 315.815, | |
| "eval_steps_per_second": 6.316, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.54251012145749, | |
| "grad_norm": 6.476918972266226, | |
| "learning_rate": 4.615384615384616e-06, | |
| "loss": 12.8594, | |
| "mean_token_accuracy": 0.6220955848693848, | |
| "num_tokens": 25413527.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.5465587044534415, | |
| "grad_norm": 6.648186663706087, | |
| "learning_rate": 4.5748987854251014e-06, | |
| "loss": 12.5703, | |
| "mean_token_accuracy": 0.6231284141540527, | |
| "num_tokens": 25480245.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.5506072874493926, | |
| "grad_norm": 7.367613086661741, | |
| "learning_rate": 4.534412955465588e-06, | |
| "loss": 13.6406, | |
| "mean_token_accuracy": 0.6029731631278992, | |
| "num_tokens": 25547914.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.5546558704453441, | |
| "grad_norm": 5.503766366101462, | |
| "learning_rate": 4.493927125506074e-06, | |
| "loss": 13.8125, | |
| "mean_token_accuracy": 0.5945603847503662, | |
| "num_tokens": 25631660.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.5587044534412957, | |
| "grad_norm": 6.8977689320409885, | |
| "learning_rate": 4.453441295546559e-06, | |
| "loss": 13.5156, | |
| "mean_token_accuracy": 0.6095985174179077, | |
| "num_tokens": 25696588.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.5587044534412957, | |
| "eval_loss": 1.6799999475479126, | |
| "eval_mean_token_accuracy": 0.6023529767990112, | |
| "eval_num_tokens": 25696588.0, | |
| "eval_runtime": 0.636, | |
| "eval_samples_per_second": 314.454, | |
| "eval_steps_per_second": 6.289, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.5627530364372468, | |
| "grad_norm": 6.227043197214259, | |
| "learning_rate": 4.412955465587045e-06, | |
| "loss": 13.2656, | |
| "mean_token_accuracy": 0.6109288930892944, | |
| "num_tokens": 25764637.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.5668016194331984, | |
| "grad_norm": 6.437168132260638, | |
| "learning_rate": 4.372469635627531e-06, | |
| "loss": 13.0156, | |
| "mean_token_accuracy": 0.6073063611984253, | |
| "num_tokens": 25828262.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.5708502024291497, | |
| "grad_norm": 6.443176214251158, | |
| "learning_rate": 4.3319838056680166e-06, | |
| "loss": 12.5391, | |
| "mean_token_accuracy": 0.6221445798873901, | |
| "num_tokens": 25894997.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.574898785425101, | |
| "grad_norm": 6.541270347260077, | |
| "learning_rate": 4.291497975708503e-06, | |
| "loss": 12.5625, | |
| "mean_token_accuracy": 0.6267231106758118, | |
| "num_tokens": 25956940.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "grad_norm": 6.622129593963462, | |
| "learning_rate": 4.251012145748988e-06, | |
| "loss": 12.5, | |
| "mean_token_accuracy": 0.620368242263794, | |
| "num_tokens": 26022017.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "eval_loss": 1.6799999475479126, | |
| "eval_mean_token_accuracy": 0.6024383455514908, | |
| "eval_num_tokens": 26022017.0, | |
| "eval_runtime": 0.6355, | |
| "eval_samples_per_second": 314.693, | |
| "eval_steps_per_second": 6.294, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.582995951417004, | |
| "grad_norm": 5.683943779857191, | |
| "learning_rate": 4.210526315789474e-06, | |
| "loss": 12.7578, | |
| "mean_token_accuracy": 0.6183803677558899, | |
| "num_tokens": 26089381.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.5870445344129553, | |
| "grad_norm": 6.4439629905284, | |
| "learning_rate": 4.170040485829959e-06, | |
| "loss": 13.5859, | |
| "mean_token_accuracy": 0.6022599339485168, | |
| "num_tokens": 26160421.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.591093117408907, | |
| "grad_norm": 6.482307740702656, | |
| "learning_rate": 4.1295546558704455e-06, | |
| "loss": 13.0938, | |
| "mean_token_accuracy": 0.6097571849822998, | |
| "num_tokens": 26227164.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.5951417004048583, | |
| "grad_norm": 5.6243349055097775, | |
| "learning_rate": 4.089068825910931e-06, | |
| "loss": 12.7969, | |
| "mean_token_accuracy": 0.6154806613922119, | |
| "num_tokens": 26297716.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.5991902834008096, | |
| "grad_norm": 7.1839751833286725, | |
| "learning_rate": 4.048582995951417e-06, | |
| "loss": 12.5312, | |
| "mean_token_accuracy": 0.627181351184845, | |
| "num_tokens": 26358579.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.5991902834008096, | |
| "eval_loss": 1.6775000095367432, | |
| "eval_mean_token_accuracy": 0.6028738915920258, | |
| "eval_num_tokens": 26358579.0, | |
| "eval_runtime": 0.6349, | |
| "eval_samples_per_second": 314.994, | |
| "eval_steps_per_second": 6.3, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.6032388663967612, | |
| "grad_norm": 6.286540348376537, | |
| "learning_rate": 4.008097165991903e-06, | |
| "loss": 12.8438, | |
| "mean_token_accuracy": 0.618080198764801, | |
| "num_tokens": 26423963.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.6072874493927125, | |
| "grad_norm": 8.094171440991735, | |
| "learning_rate": 3.967611336032389e-06, | |
| "loss": 12.6172, | |
| "mean_token_accuracy": 0.6203653216362, | |
| "num_tokens": 26473680.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.6113360323886639, | |
| "grad_norm": 6.734637207480726, | |
| "learning_rate": 3.9271255060728745e-06, | |
| "loss": 13.1016, | |
| "mean_token_accuracy": 0.6167324185371399, | |
| "num_tokens": 26531787.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 6.248322358568084, | |
| "learning_rate": 3.886639676113361e-06, | |
| "loss": 12.0234, | |
| "mean_token_accuracy": 0.6334991455078125, | |
| "num_tokens": 26589739.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.6194331983805668, | |
| "grad_norm": 7.003524821469698, | |
| "learning_rate": 3.846153846153847e-06, | |
| "loss": 12.5938, | |
| "mean_token_accuracy": 0.6244420409202576, | |
| "num_tokens": 26650518.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.6194331983805668, | |
| "eval_loss": 1.6775000095367432, | |
| "eval_mean_token_accuracy": 0.6030516028404236, | |
| "eval_num_tokens": 26650518.0, | |
| "eval_runtime": 0.6363, | |
| "eval_samples_per_second": 314.294, | |
| "eval_steps_per_second": 6.286, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.623481781376518, | |
| "grad_norm": 6.575500256754098, | |
| "learning_rate": 3.805668016194332e-06, | |
| "loss": 14.2031, | |
| "mean_token_accuracy": 0.5973594784736633, | |
| "num_tokens": 26721628.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.6275303643724697, | |
| "grad_norm": 6.399615806731187, | |
| "learning_rate": 3.7651821862348182e-06, | |
| "loss": 13.0859, | |
| "mean_token_accuracy": 0.6143900752067566, | |
| "num_tokens": 26795479.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.631578947368421, | |
| "grad_norm": 6.108759765220742, | |
| "learning_rate": 3.724696356275304e-06, | |
| "loss": 12.5, | |
| "mean_token_accuracy": 0.6269562244415283, | |
| "num_tokens": 26858994.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.6356275303643724, | |
| "grad_norm": 6.378724273369664, | |
| "learning_rate": 3.6842105263157896e-06, | |
| "loss": 12.7422, | |
| "mean_token_accuracy": 0.6249410510063171, | |
| "num_tokens": 26924821.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.639676113360324, | |
| "grad_norm": 6.284255806434736, | |
| "learning_rate": 3.6437246963562758e-06, | |
| "loss": 12.1797, | |
| "mean_token_accuracy": 0.6245778203010559, | |
| "num_tokens": 26981139.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.639676113360324, | |
| "eval_loss": 1.6775000095367432, | |
| "eval_mean_token_accuracy": 0.6031172126531601, | |
| "eval_num_tokens": 26981139.0, | |
| "eval_runtime": 0.6612, | |
| "eval_samples_per_second": 302.482, | |
| "eval_steps_per_second": 6.05, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.6437246963562753, | |
| "grad_norm": 6.284046381701035, | |
| "learning_rate": 3.6032388663967615e-06, | |
| "loss": 12.8438, | |
| "mean_token_accuracy": 0.6137465238571167, | |
| "num_tokens": 27036897.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.6477732793522266, | |
| "grad_norm": 6.192144168674015, | |
| "learning_rate": 3.562753036437247e-06, | |
| "loss": 13.4453, | |
| "mean_token_accuracy": 0.6057220697402954, | |
| "num_tokens": 27113856.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.6518218623481782, | |
| "grad_norm": 7.747379888410599, | |
| "learning_rate": 3.522267206477733e-06, | |
| "loss": 13.2812, | |
| "mean_token_accuracy": 0.6113630533218384, | |
| "num_tokens": 27187122.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.6558704453441295, | |
| "grad_norm": 7.619435387018495, | |
| "learning_rate": 3.481781376518219e-06, | |
| "loss": 12.625, | |
| "mean_token_accuracy": 0.6203478574752808, | |
| "num_tokens": 27245197.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.6599190283400809, | |
| "grad_norm": 6.774444594833395, | |
| "learning_rate": 3.4412955465587043e-06, | |
| "loss": 13.4766, | |
| "mean_token_accuracy": 0.6078824996948242, | |
| "num_tokens": 27310976.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.6599190283400809, | |
| "eval_loss": 1.6768749952316284, | |
| "eval_mean_token_accuracy": 0.6029177159070969, | |
| "eval_num_tokens": 27310976.0, | |
| "eval_runtime": 0.652, | |
| "eval_samples_per_second": 306.731, | |
| "eval_steps_per_second": 6.135, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.6639676113360324, | |
| "grad_norm": 6.546428537195409, | |
| "learning_rate": 3.4008097165991905e-06, | |
| "loss": 13.0, | |
| "mean_token_accuracy": 0.6162744760513306, | |
| "num_tokens": 27381838.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.6680161943319838, | |
| "grad_norm": 5.661705927031424, | |
| "learning_rate": 3.3603238866396766e-06, | |
| "loss": 12.9844, | |
| "mean_token_accuracy": 0.6101025938987732, | |
| "num_tokens": 27457427.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.6720647773279351, | |
| "grad_norm": 6.634404347857771, | |
| "learning_rate": 3.3198380566801623e-06, | |
| "loss": 13.2344, | |
| "mean_token_accuracy": 0.6103484034538269, | |
| "num_tokens": 27525598.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.6761133603238867, | |
| "grad_norm": 6.4588054803695165, | |
| "learning_rate": 3.279352226720648e-06, | |
| "loss": 13.0859, | |
| "mean_token_accuracy": 0.6082247495651245, | |
| "num_tokens": 27586138.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.680161943319838, | |
| "grad_norm": 6.229468320759774, | |
| "learning_rate": 3.2388663967611337e-06, | |
| "loss": 13.2578, | |
| "mean_token_accuracy": 0.6039714813232422, | |
| "num_tokens": 27656654.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.680161943319838, | |
| "eval_loss": 1.6768749952316284, | |
| "eval_mean_token_accuracy": 0.603041797876358, | |
| "eval_num_tokens": 27656654.0, | |
| "eval_runtime": 0.6416, | |
| "eval_samples_per_second": 311.713, | |
| "eval_steps_per_second": 6.234, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "grad_norm": 6.530706970746754, | |
| "learning_rate": 3.19838056680162e-06, | |
| "loss": 12.9609, | |
| "mean_token_accuracy": 0.616111159324646, | |
| "num_tokens": 27719395.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.688259109311741, | |
| "grad_norm": 5.974041721296059, | |
| "learning_rate": 3.157894736842105e-06, | |
| "loss": 13.0781, | |
| "mean_token_accuracy": 0.6097447872161865, | |
| "num_tokens": 27805700.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.6923076923076923, | |
| "grad_norm": 6.9520061051049655, | |
| "learning_rate": 3.1174089068825913e-06, | |
| "loss": 13.1328, | |
| "mean_token_accuracy": 0.6185556054115295, | |
| "num_tokens": 27870607.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.6963562753036436, | |
| "grad_norm": 6.687485877625691, | |
| "learning_rate": 3.0769230769230774e-06, | |
| "loss": 13.125, | |
| "mean_token_accuracy": 0.6092858910560608, | |
| "num_tokens": 27935199.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.7004048582995952, | |
| "grad_norm": 5.89529220246302, | |
| "learning_rate": 3.0364372469635627e-06, | |
| "loss": 12.7344, | |
| "mean_token_accuracy": 0.6122572422027588, | |
| "num_tokens": 27999633.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.7004048582995952, | |
| "eval_loss": 1.6762499809265137, | |
| "eval_mean_token_accuracy": 0.6032748818397522, | |
| "eval_num_tokens": 27999633.0, | |
| "eval_runtime": 0.6365, | |
| "eval_samples_per_second": 314.198, | |
| "eval_steps_per_second": 6.284, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.7044534412955465, | |
| "grad_norm": 5.97670859064412, | |
| "learning_rate": 2.995951417004049e-06, | |
| "loss": 13.6172, | |
| "mean_token_accuracy": 0.6090903878211975, | |
| "num_tokens": 28070629.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.708502024291498, | |
| "grad_norm": 6.35673530964282, | |
| "learning_rate": 2.955465587044535e-06, | |
| "loss": 14.1406, | |
| "mean_token_accuracy": 0.5912197828292847, | |
| "num_tokens": 28143265.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.7125506072874495, | |
| "grad_norm": 6.087049071022692, | |
| "learning_rate": 2.9149797570850203e-06, | |
| "loss": 12.7266, | |
| "mean_token_accuracy": 0.6160033345222473, | |
| "num_tokens": 28217426.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.7165991902834008, | |
| "grad_norm": 6.5904480079951915, | |
| "learning_rate": 2.8744939271255064e-06, | |
| "loss": 12.9141, | |
| "mean_token_accuracy": 0.6187474727630615, | |
| "num_tokens": 28286022.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.7206477732793521, | |
| "grad_norm": 6.195739080649372, | |
| "learning_rate": 2.834008097165992e-06, | |
| "loss": 13.5859, | |
| "mean_token_accuracy": 0.6009435057640076, | |
| "num_tokens": 28356036.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.7206477732793521, | |
| "eval_loss": 1.6762499809265137, | |
| "eval_mean_token_accuracy": 0.6033691614866257, | |
| "eval_num_tokens": 28356036.0, | |
| "eval_runtime": 0.641, | |
| "eval_samples_per_second": 312.0, | |
| "eval_steps_per_second": 6.24, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.7246963562753037, | |
| "grad_norm": 6.307993289907047, | |
| "learning_rate": 2.7935222672064783e-06, | |
| "loss": 13.1875, | |
| "mean_token_accuracy": 0.6147733926773071, | |
| "num_tokens": 28430355.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.728744939271255, | |
| "grad_norm": 6.289447300448821, | |
| "learning_rate": 2.7530364372469636e-06, | |
| "loss": 12.5156, | |
| "mean_token_accuracy": 0.6260796785354614, | |
| "num_tokens": 28494906.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.7327935222672064, | |
| "grad_norm": 6.826350614499696, | |
| "learning_rate": 2.7125506072874497e-06, | |
| "loss": 12.6094, | |
| "mean_token_accuracy": 0.6202298998832703, | |
| "num_tokens": 28562395.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.736842105263158, | |
| "grad_norm": 6.595176524843769, | |
| "learning_rate": 2.672064777327936e-06, | |
| "loss": 12.8047, | |
| "mean_token_accuracy": 0.6235784292221069, | |
| "num_tokens": 28624889.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.7408906882591093, | |
| "grad_norm": 5.691303853471213, | |
| "learning_rate": 2.631578947368421e-06, | |
| "loss": 13.2656, | |
| "mean_token_accuracy": 0.6069343686103821, | |
| "num_tokens": 28698788.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.7408906882591093, | |
| "eval_loss": 1.675624966621399, | |
| "eval_mean_token_accuracy": 0.603339210152626, | |
| "eval_num_tokens": 28698788.0, | |
| "eval_runtime": 0.6338, | |
| "eval_samples_per_second": 315.581, | |
| "eval_steps_per_second": 6.312, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.7449392712550607, | |
| "grad_norm": 5.997572345174048, | |
| "learning_rate": 2.5910931174089072e-06, | |
| "loss": 12.9844, | |
| "mean_token_accuracy": 0.6104946732521057, | |
| "num_tokens": 28773747.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.7489878542510122, | |
| "grad_norm": 7.249965290425393, | |
| "learning_rate": 2.550607287449393e-06, | |
| "loss": 13.4297, | |
| "mean_token_accuracy": 0.6104491353034973, | |
| "num_tokens": 28842544.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.7530364372469636, | |
| "grad_norm": 6.000414268983646, | |
| "learning_rate": 2.5101214574898787e-06, | |
| "loss": 12.5781, | |
| "mean_token_accuracy": 0.6204512119293213, | |
| "num_tokens": 28912462.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.757085020242915, | |
| "grad_norm": 6.118904164123435, | |
| "learning_rate": 2.4696356275303644e-06, | |
| "loss": 12.7109, | |
| "mean_token_accuracy": 0.6210896968841553, | |
| "num_tokens": 28992347.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.7611336032388665, | |
| "grad_norm": 6.1441994160988065, | |
| "learning_rate": 2.42914979757085e-06, | |
| "loss": 11.4531, | |
| "mean_token_accuracy": 0.6428948044776917, | |
| "num_tokens": 29066420.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.7611336032388665, | |
| "eval_loss": 1.6749999523162842, | |
| "eval_mean_token_accuracy": 0.6034704595804214, | |
| "eval_num_tokens": 29066420.0, | |
| "eval_runtime": 0.6572, | |
| "eval_samples_per_second": 304.314, | |
| "eval_steps_per_second": 6.086, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.7651821862348178, | |
| "grad_norm": 8.29981852170875, | |
| "learning_rate": 2.3886639676113362e-06, | |
| "loss": 12.7266, | |
| "mean_token_accuracy": 0.6216300129890442, | |
| "num_tokens": 29140409.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.7692307692307692, | |
| "grad_norm": 7.051595151204688, | |
| "learning_rate": 2.348178137651822e-06, | |
| "loss": 12.9375, | |
| "mean_token_accuracy": 0.6193385124206543, | |
| "num_tokens": 29200219.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.7732793522267207, | |
| "grad_norm": 6.304795255922526, | |
| "learning_rate": 2.307692307692308e-06, | |
| "loss": 12.4688, | |
| "mean_token_accuracy": 0.6236432194709778, | |
| "num_tokens": 29271314.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.777327935222672, | |
| "grad_norm": 5.556856434206601, | |
| "learning_rate": 2.267206477732794e-06, | |
| "loss": 12.4609, | |
| "mean_token_accuracy": 0.6236319541931152, | |
| "num_tokens": 29342739.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.7813765182186234, | |
| "grad_norm": 7.6181275587053445, | |
| "learning_rate": 2.2267206477732795e-06, | |
| "loss": 12.3359, | |
| "mean_token_accuracy": 0.6281608939170837, | |
| "num_tokens": 29399868.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.7813765182186234, | |
| "eval_loss": 1.6746875047683716, | |
| "eval_mean_token_accuracy": 0.6035052537918091, | |
| "eval_num_tokens": 29399868.0, | |
| "eval_runtime": 0.6412, | |
| "eval_samples_per_second": 311.93, | |
| "eval_steps_per_second": 6.239, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.785425101214575, | |
| "grad_norm": 7.163759421924407, | |
| "learning_rate": 2.1862348178137656e-06, | |
| "loss": 13.4844, | |
| "mean_token_accuracy": 0.6067489385604858, | |
| "num_tokens": 29471825.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.7894736842105263, | |
| "grad_norm": 5.980379834577915, | |
| "learning_rate": 2.1457489878542513e-06, | |
| "loss": 12.6953, | |
| "mean_token_accuracy": 0.6232491135597229, | |
| "num_tokens": 29543780.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.7935222672064777, | |
| "grad_norm": 5.658389056927855, | |
| "learning_rate": 2.105263157894737e-06, | |
| "loss": 12.9844, | |
| "mean_token_accuracy": 0.6110087633132935, | |
| "num_tokens": 29620910.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.7975708502024292, | |
| "grad_norm": 5.632632433555104, | |
| "learning_rate": 2.0647773279352228e-06, | |
| "loss": 12.3203, | |
| "mean_token_accuracy": 0.6279122829437256, | |
| "num_tokens": 29695744.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.8016194331983806, | |
| "grad_norm": 5.67469055605223, | |
| "learning_rate": 2.0242914979757085e-06, | |
| "loss": 12.5547, | |
| "mean_token_accuracy": 0.6283406615257263, | |
| "num_tokens": 29767874.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.8016194331983806, | |
| "eval_loss": 1.6746875047683716, | |
| "eval_mean_token_accuracy": 0.6034050285816193, | |
| "eval_num_tokens": 29767874.0, | |
| "eval_runtime": 0.6422, | |
| "eval_samples_per_second": 311.443, | |
| "eval_steps_per_second": 6.229, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.805668016194332, | |
| "grad_norm": 6.569962521896024, | |
| "learning_rate": 1.9838056680161946e-06, | |
| "loss": 11.7422, | |
| "mean_token_accuracy": 0.6383576393127441, | |
| "num_tokens": 29831675.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.8097165991902835, | |
| "grad_norm": 6.86808486740946, | |
| "learning_rate": 1.9433198380566803e-06, | |
| "loss": 13.7891, | |
| "mean_token_accuracy": 0.5985156893730164, | |
| "num_tokens": 29900728.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.8137651821862348, | |
| "grad_norm": 5.57356193397258, | |
| "learning_rate": 1.902834008097166e-06, | |
| "loss": 13.1406, | |
| "mean_token_accuracy": 0.6123852133750916, | |
| "num_tokens": 29970601.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.8178137651821862, | |
| "grad_norm": 6.094864274620779, | |
| "learning_rate": 1.862348178137652e-06, | |
| "loss": 13.1562, | |
| "mean_token_accuracy": 0.613720178604126, | |
| "num_tokens": 30034673.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.8218623481781377, | |
| "grad_norm": 7.163324826123694, | |
| "learning_rate": 1.8218623481781379e-06, | |
| "loss": 12.1641, | |
| "mean_token_accuracy": 0.6280580759048462, | |
| "num_tokens": 30091840.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.8218623481781377, | |
| "eval_loss": 1.6737500429153442, | |
| "eval_mean_token_accuracy": 0.6033113747835159, | |
| "eval_num_tokens": 30091840.0, | |
| "eval_runtime": 0.6388, | |
| "eval_samples_per_second": 313.097, | |
| "eval_steps_per_second": 6.262, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.825910931174089, | |
| "grad_norm": 5.721465202296356, | |
| "learning_rate": 1.7813765182186236e-06, | |
| "loss": 13.1172, | |
| "mean_token_accuracy": 0.6090490818023682, | |
| "num_tokens": 30160795.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.8299595141700404, | |
| "grad_norm": 5.834496019461034, | |
| "learning_rate": 1.7408906882591095e-06, | |
| "loss": 11.6094, | |
| "mean_token_accuracy": 0.6369089484214783, | |
| "num_tokens": 30223207.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.834008097165992, | |
| "grad_norm": 5.989944536955842, | |
| "learning_rate": 1.7004048582995952e-06, | |
| "loss": 12.8125, | |
| "mean_token_accuracy": 0.6168772578239441, | |
| "num_tokens": 30292998.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.8380566801619433, | |
| "grad_norm": 6.175252808359519, | |
| "learning_rate": 1.6599190283400812e-06, | |
| "loss": 12.4141, | |
| "mean_token_accuracy": 0.6250400543212891, | |
| "num_tokens": 30358645.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 6.320123552822768, | |
| "learning_rate": 1.6194331983805669e-06, | |
| "loss": 12.7969, | |
| "mean_token_accuracy": 0.6187460422515869, | |
| "num_tokens": 30426126.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "eval_loss": 1.6737500429153442, | |
| "eval_mean_token_accuracy": 0.6035889238119125, | |
| "eval_num_tokens": 30426126.0, | |
| "eval_runtime": 0.6462, | |
| "eval_samples_per_second": 309.508, | |
| "eval_steps_per_second": 6.19, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.8461538461538463, | |
| "grad_norm": 7.054930335620368, | |
| "learning_rate": 1.5789473684210526e-06, | |
| "loss": 12.9453, | |
| "mean_token_accuracy": 0.6147574186325073, | |
| "num_tokens": 30490700.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.8502024291497976, | |
| "grad_norm": 5.963169448840178, | |
| "learning_rate": 1.5384615384615387e-06, | |
| "loss": 12.7891, | |
| "mean_token_accuracy": 0.6221683025360107, | |
| "num_tokens": 30564085.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.854251012145749, | |
| "grad_norm": 6.545239654505327, | |
| "learning_rate": 1.4979757085020244e-06, | |
| "loss": 13.8984, | |
| "mean_token_accuracy": 0.6013196110725403, | |
| "num_tokens": 30629773.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.8582995951417005, | |
| "grad_norm": 7.19847490245891, | |
| "learning_rate": 1.4574898785425101e-06, | |
| "loss": 12.6953, | |
| "mean_token_accuracy": 0.6308658123016357, | |
| "num_tokens": 30685115.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.8623481781376519, | |
| "grad_norm": 6.402339501112951, | |
| "learning_rate": 1.417004048582996e-06, | |
| "loss": 14.625, | |
| "mean_token_accuracy": 0.5789289474487305, | |
| "num_tokens": 30759499.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.8623481781376519, | |
| "eval_loss": 1.6737500429153442, | |
| "eval_mean_token_accuracy": 0.6036617457866669, | |
| "eval_num_tokens": 30759499.0, | |
| "eval_runtime": 0.6471, | |
| "eval_samples_per_second": 309.048, | |
| "eval_steps_per_second": 6.181, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.8663967611336032, | |
| "grad_norm": 6.554938664776423, | |
| "learning_rate": 1.3765182186234818e-06, | |
| "loss": 12.0312, | |
| "mean_token_accuracy": 0.6326593160629272, | |
| "num_tokens": 30821383.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.8704453441295548, | |
| "grad_norm": 7.057225553782759, | |
| "learning_rate": 1.336032388663968e-06, | |
| "loss": 12.2266, | |
| "mean_token_accuracy": 0.6232849359512329, | |
| "num_tokens": 30881357.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.874493927125506, | |
| "grad_norm": 5.899931807473347, | |
| "learning_rate": 1.2955465587044536e-06, | |
| "loss": 12.8906, | |
| "mean_token_accuracy": 0.6196330785751343, | |
| "num_tokens": 30944702.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.8785425101214575, | |
| "grad_norm": 6.484707334873664, | |
| "learning_rate": 1.2550607287449393e-06, | |
| "loss": 12.6484, | |
| "mean_token_accuracy": 0.6225480437278748, | |
| "num_tokens": 31018431.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.882591093117409, | |
| "grad_norm": 6.294801781620055, | |
| "learning_rate": 1.214574898785425e-06, | |
| "loss": 12.8203, | |
| "mean_token_accuracy": 0.6174430847167969, | |
| "num_tokens": 31081947.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.882591093117409, | |
| "eval_loss": 1.6731250286102295, | |
| "eval_mean_token_accuracy": 0.6035991758108139, | |
| "eval_num_tokens": 31081947.0, | |
| "eval_runtime": 0.6369, | |
| "eval_samples_per_second": 314.028, | |
| "eval_steps_per_second": 6.281, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.8866396761133604, | |
| "grad_norm": 7.0152791422709795, | |
| "learning_rate": 1.174089068825911e-06, | |
| "loss": 13.6797, | |
| "mean_token_accuracy": 0.5981454849243164, | |
| "num_tokens": 31147796.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.8906882591093117, | |
| "grad_norm": 6.303822986663203, | |
| "learning_rate": 1.133603238866397e-06, | |
| "loss": 13.2266, | |
| "mean_token_accuracy": 0.6078663468360901, | |
| "num_tokens": 31212134.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.8947368421052633, | |
| "grad_norm": 5.882937931956007, | |
| "learning_rate": 1.0931174089068828e-06, | |
| "loss": 13.8359, | |
| "mean_token_accuracy": 0.6062960624694824, | |
| "num_tokens": 31282591.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.8987854251012146, | |
| "grad_norm": 5.937096953516108, | |
| "learning_rate": 1.0526315789473685e-06, | |
| "loss": 12.3516, | |
| "mean_token_accuracy": 0.6282551884651184, | |
| "num_tokens": 31354656.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.902834008097166, | |
| "grad_norm": 6.3577908169539255, | |
| "learning_rate": 1.0121457489878542e-06, | |
| "loss": 12.9531, | |
| "mean_token_accuracy": 0.6160109043121338, | |
| "num_tokens": 31420675.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.902834008097166, | |
| "eval_loss": 1.673437476158142, | |
| "eval_mean_token_accuracy": 0.6037902683019638, | |
| "eval_num_tokens": 31420675.0, | |
| "eval_runtime": 0.6715, | |
| "eval_samples_per_second": 297.827, | |
| "eval_steps_per_second": 5.957, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.9068825910931175, | |
| "grad_norm": 6.62892566578751, | |
| "learning_rate": 9.716599190283402e-07, | |
| "loss": 13.2188, | |
| "mean_token_accuracy": 0.6141934990882874, | |
| "num_tokens": 31486290.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.9109311740890689, | |
| "grad_norm": 6.1078260873922625, | |
| "learning_rate": 9.31174089068826e-07, | |
| "loss": 12.6641, | |
| "mean_token_accuracy": 0.6177834868431091, | |
| "num_tokens": 31557060.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.9149797570850202, | |
| "grad_norm": 6.925934956753748, | |
| "learning_rate": 8.906882591093118e-07, | |
| "loss": 13.0938, | |
| "mean_token_accuracy": 0.613021969795227, | |
| "num_tokens": 31628035.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 1.9190283400809718, | |
| "grad_norm": 5.645167673990774, | |
| "learning_rate": 8.502024291497976e-07, | |
| "loss": 13.0625, | |
| "mean_token_accuracy": 0.6135348677635193, | |
| "num_tokens": 31704450.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 6.3104167802865385, | |
| "learning_rate": 8.097165991902834e-07, | |
| "loss": 12.6016, | |
| "mean_token_accuracy": 0.6247060298919678, | |
| "num_tokens": 31771696.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "eval_loss": 1.6740624904632568, | |
| "eval_mean_token_accuracy": 0.6038688272237778, | |
| "eval_num_tokens": 31771696.0, | |
| "eval_runtime": 0.6717, | |
| "eval_samples_per_second": 297.737, | |
| "eval_steps_per_second": 5.955, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.9271255060728745, | |
| "grad_norm": 8.75546525050239, | |
| "learning_rate": 7.692307692307694e-07, | |
| "loss": 14.3125, | |
| "mean_token_accuracy": 0.6006154417991638, | |
| "num_tokens": 31833828.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 1.931174089068826, | |
| "grad_norm": 6.568540834125108, | |
| "learning_rate": 7.287449392712551e-07, | |
| "loss": 12.4688, | |
| "mean_token_accuracy": 0.6260876655578613, | |
| "num_tokens": 31908019.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 1.9352226720647774, | |
| "grad_norm": 5.992577753842725, | |
| "learning_rate": 6.882591093117409e-07, | |
| "loss": 11.6328, | |
| "mean_token_accuracy": 0.64389967918396, | |
| "num_tokens": 31968785.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 1.9392712550607287, | |
| "grad_norm": 6.737090705162421, | |
| "learning_rate": 6.477732793522268e-07, | |
| "loss": 12.6562, | |
| "mean_token_accuracy": 0.629744291305542, | |
| "num_tokens": 32032478.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 1.9433198380566803, | |
| "grad_norm": 6.273424964829595, | |
| "learning_rate": 6.072874493927125e-07, | |
| "loss": 12.1953, | |
| "mean_token_accuracy": 0.6316378116607666, | |
| "num_tokens": 32096023.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.9433198380566803, | |
| "eval_loss": 1.6731250286102295, | |
| "eval_mean_token_accuracy": 0.6038562655448914, | |
| "eval_num_tokens": 32096023.0, | |
| "eval_runtime": 0.637, | |
| "eval_samples_per_second": 313.985, | |
| "eval_steps_per_second": 6.28, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.9473684210526314, | |
| "grad_norm": 6.127164798465686, | |
| "learning_rate": 5.668016194331984e-07, | |
| "loss": 12.1719, | |
| "mean_token_accuracy": 0.6262784600257874, | |
| "num_tokens": 32164919.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 1.951417004048583, | |
| "grad_norm": 5.756695804667141, | |
| "learning_rate": 5.263157894736843e-07, | |
| "loss": 12.6875, | |
| "mean_token_accuracy": 0.6200114488601685, | |
| "num_tokens": 32233114.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 1.9554655870445345, | |
| "grad_norm": 6.564768636864393, | |
| "learning_rate": 4.858299595141701e-07, | |
| "loss": 13.1953, | |
| "mean_token_accuracy": 0.6086875796318054, | |
| "num_tokens": 32303117.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 1.9595141700404857, | |
| "grad_norm": 7.252534772707513, | |
| "learning_rate": 4.453441295546559e-07, | |
| "loss": 13.25, | |
| "mean_token_accuracy": 0.6089531779289246, | |
| "num_tokens": 32373927.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 1.9635627530364372, | |
| "grad_norm": 6.803603694268994, | |
| "learning_rate": 4.048582995951417e-07, | |
| "loss": 12.4609, | |
| "mean_token_accuracy": 0.624879777431488, | |
| "num_tokens": 32444699.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.9635627530364372, | |
| "eval_loss": 1.6731250286102295, | |
| "eval_mean_token_accuracy": 0.6039727181196213, | |
| "eval_num_tokens": 32444699.0, | |
| "eval_runtime": 0.6475, | |
| "eval_samples_per_second": 308.869, | |
| "eval_steps_per_second": 6.177, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.9676113360323888, | |
| "grad_norm": 6.049306687892726, | |
| "learning_rate": 3.6437246963562754e-07, | |
| "loss": 14.0625, | |
| "mean_token_accuracy": 0.593756914138794, | |
| "num_tokens": 32516907.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 1.97165991902834, | |
| "grad_norm": 7.270949297430324, | |
| "learning_rate": 3.238866396761134e-07, | |
| "loss": 12.6406, | |
| "mean_token_accuracy": 0.6211447715759277, | |
| "num_tokens": 32583342.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 1.9757085020242915, | |
| "grad_norm": 8.038406126817675, | |
| "learning_rate": 2.834008097165992e-07, | |
| "loss": 12.9766, | |
| "mean_token_accuracy": 0.6204248666763306, | |
| "num_tokens": 32639142.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 1.979757085020243, | |
| "grad_norm": 6.211130232502858, | |
| "learning_rate": 2.4291497975708504e-07, | |
| "loss": 12.1953, | |
| "mean_token_accuracy": 0.6283571124076843, | |
| "num_tokens": 32705595.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 1.9838056680161942, | |
| "grad_norm": 6.72573911112471, | |
| "learning_rate": 2.0242914979757086e-07, | |
| "loss": 13.8281, | |
| "mean_token_accuracy": 0.5990269184112549, | |
| "num_tokens": 32771222.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.9838056680161942, | |
| "eval_loss": 1.6731250286102295, | |
| "eval_mean_token_accuracy": 0.6039326936006546, | |
| "eval_num_tokens": 32771222.0, | |
| "eval_runtime": 0.639, | |
| "eval_samples_per_second": 312.997, | |
| "eval_steps_per_second": 6.26, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.9878542510121457, | |
| "grad_norm": 6.616901783015977, | |
| "learning_rate": 1.619433198380567e-07, | |
| "loss": 12.0078, | |
| "mean_token_accuracy": 0.6346266865730286, | |
| "num_tokens": 32827549.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 1.9919028340080973, | |
| "grad_norm": 6.065995562899032, | |
| "learning_rate": 1.2145748987854252e-07, | |
| "loss": 13.1484, | |
| "mean_token_accuracy": 0.6112900972366333, | |
| "num_tokens": 32900562.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 1.9959514170040484, | |
| "grad_norm": 6.877917660573328, | |
| "learning_rate": 8.097165991902835e-08, | |
| "loss": 13.1406, | |
| "mean_token_accuracy": 0.6131631135940552, | |
| "num_tokens": 32967282.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 6.297817735745015, | |
| "learning_rate": 4.0485829959514176e-08, | |
| "loss": 12.4141, | |
| "mean_token_accuracy": 0.6238685250282288, | |
| "num_tokens": 33035067.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "step": 494, | |
| "total_flos": 39806562140160.0, | |
| "train_loss": 13.738502656882591, | |
| "train_runtime": 467.8481, | |
| "train_samples_per_second": 67.569, | |
| "train_steps_per_second": 1.056 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 494, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 39806562140160.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |