{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06872852233676977, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 1.6542620658874512, "epoch": 0.003436426116838488, "frac_reward_zero_std": 0.0, "grad_norm": 1.1348820924758911, "kl": 0.0, "learning_rate": 5e-06, "loss": -0.046760596334934235, "num_tokens": 1705.0, "reward": 0.5199999809265137, "reward_std": 0.9467136263847351, "rewards/linear_reward/mean": 0.5199999809265137, "rewards/linear_reward/std": 0.9467136263847351, "step": 1, "step_time": 4.811463406018447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 60.333335876464844, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 6.167017459869385, "epoch": 0.006872852233676976, "frac_reward_zero_std": 0.0, "grad_norm": 0.5547046065330505, "kl": 0.0006387412431649864, "learning_rate": 4.982817869415808e-06, "loss": -0.2076236754655838, "num_tokens": 4946.0, "reward": 0.35850000381469727, "reward_std": 0.8958175182342529, "rewards/linear_reward/mean": 0.35850000381469727, "rewards/linear_reward/std": 0.8958175182342529, "step": 2, "step_time": 10.362723789003212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 99.5, "completions/mean_terminated_length": 47.333335876464844, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 5.633602619171143, "epoch": 0.010309278350515464, "frac_reward_zero_std": 0.0, "grad_norm": 0.4426007568836212, "kl": 0.0013198162196204066, "learning_rate": 4.9656357388316155e-06, "loss": -0.05219532549381256, "num_tokens": 6884.0, "reward": 0.03999999165534973, "reward_std": 0.8073000311851501, "rewards/linear_reward/mean": 0.03999999165534973, "rewards/linear_reward/std": 0.8072999715805054, "step": 3, "step_time": 10.486546409025323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 1.187355637550354, "epoch": 0.013745704467353952, "frac_reward_zero_std": 1.0, "grad_norm": 0.003932479303330183, "kl": 0.0007797930738888681, "learning_rate": 4.948453608247423e-06, "loss": 3.898965587723069e-05, "num_tokens": 8586.0, "reward": 1.0, "reward_std": 0.0, "rewards/linear_reward/mean": 1.0, "rewards/linear_reward/std": 0.0, "step": 4, "step_time": 2.7394606139860116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 1.9143180847167969, "epoch": 0.01718213058419244, "frac_reward_zero_std": 0.0, "grad_norm": 0.8697563409805298, "kl": 0.0010849714744836092, "learning_rate": 4.9312714776632305e-06, "loss": -0.02122233435511589, "num_tokens": 10357.0, "reward": 0.19999998807907104, "reward_std": 0.9055384993553162, "rewards/linear_reward/mean": 0.19999998807907104, "rewards/linear_reward/std": 0.9055385589599609, "step": 5, "step_time": 2.969696521991864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 98.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 6.860729694366455, "epoch": 0.020618556701030927, "frac_reward_zero_std": 0.0, "grad_norm": 0.30146193504333496, "kl": 0.0006769314641132951, "learning_rate": 4.914089347079038e-06, "loss": -0.35087525844573975, "num_tokens": 13503.0, "reward": -0.5, "reward_std": 0.5773502588272095, "rewards/linear_reward/mean": -0.5, "rewards/linear_reward/std": 0.5773502588272095, "step": 6, "step_time": 10.322425201011356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 109.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 5.4356794357299805, "epoch": 0.024054982817869417, "frac_reward_zero_std": 0.0, "grad_norm": 0.3599326014518738, "kl": 0.0008105345768854022, "learning_rate": 4.8969072164948455e-06, "loss": 0.3170129954814911, "num_tokens": 17215.0, "reward": 0.699999988079071, "reward_std": 0.4690415561199188, "rewards/linear_reward/mean": 0.699999988079071, "rewards/linear_reward/std": 0.4690415859222412, "step": 7, "step_time": 10.544834155996796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 63.5, "completions/mean_terminated_length": 63.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 1.924232006072998, "epoch": 0.027491408934707903, "frac_reward_zero_std": 0.0, "grad_norm": 0.5054089426994324, "kl": 0.0007124484400264919, "learning_rate": 4.879725085910653e-06, "loss": 0.14767341315746307, "num_tokens": 19581.0, "reward": 0.75, "reward_std": 0.5, "rewards/linear_reward/mean": 0.75, "rewards/linear_reward/std": 0.5, "step": 8, "step_time": 4.267783374991268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 88.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 6.544665813446045, "epoch": 0.030927835051546393, "frac_reward_zero_std": 0.0, "grad_norm": 0.22817742824554443, "kl": 0.0006943146581761539, "learning_rate": 4.8625429553264606e-06, "loss": 0.19407318532466888, "num_tokens": 23377.0, "reward": 0.375, "reward_std": 0.4349329471588135, "rewards/linear_reward/mean": 0.375, "rewards/linear_reward/std": 0.4349329471588135, "step": 9, "step_time": 10.37712084496161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 85.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 8.20298957824707, "epoch": 0.03436426116838488, "frac_reward_zero_std": 0.0, "grad_norm": 0.14684802293777466, "kl": 0.0005284421495161951, "learning_rate": 4.845360824742268e-06, "loss": 0.13534295558929443, "num_tokens": 27506.0, "reward": 0.22499999403953552, "reward_std": 0.44999998807907104, "rewards/linear_reward/mean": 0.22499999403953552, "rewards/linear_reward/std": 0.44999995827674866, "step": 10, "step_time": 10.44249604898505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 70.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 5.8667192459106445, "epoch": 0.037800687285223365, "frac_reward_zero_std": 0.0, "grad_norm": 0.2726059556007385, "kl": 0.0008200935553759336, "learning_rate": 4.8281786941580765e-06, "loss": 0.14489078521728516, "num_tokens": 30084.0, "reward": 0.25, "reward_std": 0.5, "rewards/linear_reward/mean": 0.25, "rewards/linear_reward/std": 0.5, "step": 11, "step_time": 10.283689843025059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 64.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 7.840090274810791, "epoch": 0.041237113402061855, "frac_reward_zero_std": 0.0, "grad_norm": 0.16126857697963715, "kl": 0.0006341434200294316, "learning_rate": 4.810996563573884e-06, "loss": 0.14768795669078827, "num_tokens": 34000.0, "reward": 0.22499999403953552, "reward_std": 0.44999998807907104, "rewards/linear_reward/mean": 0.22499999403953552, "rewards/linear_reward/std": 0.44999995827674866, "step": 12, "step_time": 10.546107036992908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 96.75, "completions/mean_terminated_length": 43.66666793823242, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 6.812761306762695, "epoch": 0.044673539518900345, "frac_reward_zero_std": 0.0, "grad_norm": 0.368470698595047, "kl": 0.0006157478783279657, "learning_rate": 4.7938144329896915e-06, "loss": 0.3335062563419342, "num_tokens": 36871.0, "reward": 0.5849999785423279, "reward_std": 0.49889877438545227, "rewards/linear_reward/mean": 0.5849999785423279, "rewards/linear_reward/std": 0.49889877438545227, "step": 13, "step_time": 10.325460208987352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 101.0, "completions/mean_terminated_length": 49.333335876464844, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 6.1570210456848145, "epoch": 0.048109965635738834, "frac_reward_zero_std": 0.0, "grad_norm": 0.32078948616981506, "kl": 0.0005901199183426797, "learning_rate": 4.776632302405499e-06, "loss": -0.2796734869480133, "num_tokens": 39495.0, "reward": -0.5, "reward_std": 0.5773502588272095, "rewards/linear_reward/mean": -0.5, "rewards/linear_reward/std": 0.5773502588272095, "step": 14, "step_time": 10.405442989023868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 165.0, "completions/mean_terminated_length": 74.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 6.912732124328613, "epoch": 0.05154639175257732, "frac_reward_zero_std": 0.0, "grad_norm": 0.26652705669403076, "kl": 0.000629998161457479, "learning_rate": 4.759450171821306e-06, "loss": 0.24821330606937408, "num_tokens": 43431.0, "reward": 0.44999998807907104, "reward_std": 0.5196152329444885, "rewards/linear_reward/mean": 0.44999998807907104, "rewards/linear_reward/std": 0.5196152329444885, "step": 15, "step_time": 10.592712028999813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 85.5, "completions/mean_terminated_length": 85.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 4.354438781738281, "epoch": 0.054982817869415807, "frac_reward_zero_std": 0.0, "grad_norm": 0.3930811583995819, "kl": 0.0008360631763935089, "learning_rate": 4.742268041237113e-06, "loss": 0.17124062776565552, "num_tokens": 47049.0, "reward": 0.4749999940395355, "reward_std": 0.5499999523162842, "rewards/linear_reward/mean": 0.4749999940395355, "rewards/linear_reward/std": 0.550000011920929, "step": 16, "step_time": 7.009384323027916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 1.2932045459747314, "epoch": 0.058419243986254296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00355098326690495, "kl": 0.0005276052397675812, "learning_rate": 4.725085910652922e-06, "loss": 2.6380263079772703e-05, "num_tokens": 48803.0, "reward": 1.0, "reward_std": 0.0, "rewards/linear_reward/mean": 1.0, "rewards/linear_reward/std": 0.0, "step": 17, "step_time": 2.6762739830301143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 57.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 5.419597148895264, "epoch": 0.061855670103092786, "frac_reward_zero_std": 0.0, "grad_norm": 0.4502696990966797, "kl": 0.0007034471491351724, "learning_rate": 4.707903780068729e-06, "loss": 0.349566787481308, "num_tokens": 50854.0, "reward": 0.75, "reward_std": 0.5, "rewards/linear_reward/mean": 0.75, "rewards/linear_reward/std": 0.5, "step": 18, "step_time": 10.25336693302961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 1.7835241556167603, "epoch": 0.06529209621993128, "frac_reward_zero_std": 0.0, "grad_norm": 0.39402562379837036, "kl": 0.0008214744739234447, "learning_rate": 4.690721649484537e-06, "loss": -0.04086802154779434, "num_tokens": 52566.0, "reward": 0.75, "reward_std": 0.5, "rewards/linear_reward/mean": 0.75, "rewards/linear_reward/std": 0.5, "step": 19, "step_time": 3.4280499030137435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 1.4242854118347168, "epoch": 0.06872852233676977, "frac_reward_zero_std": 1.0, "grad_norm": 0.005312944296747446, "kl": 0.0007797326543368399, "learning_rate": 4.673539518900344e-06, "loss": 3.8986632716841996e-05, "num_tokens": 54345.0, "reward": 1.0, "reward_std": 0.0, "rewards/linear_reward/mean": 1.0, "rewards/linear_reward/std": 0.0, "step": 20, "step_time": 2.476823721022811 } ], "logging_steps": 1, "max_steps": 291, "num_input_tokens_seen": 54345, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }