diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py index 1ff719a..1b23ad4 100644 --- a/machine/jobs/nmt_engine_build_job.py +++ b/machine/jobs/nmt_engine_build_job.py @@ -75,7 +75,7 @@ def _train_model( ): model_trainer.train(progress=phase_progress, check_canceled=check_canceled) model_trainer.save() - train_corpus_size = model_trainer.stats.train_corpus_size + train_corpus_size = parallel_corpus.count() return train_corpus_size, float("nan") def _batch_inference( diff --git a/machine/jobs/settings.yaml b/machine/jobs/settings.yaml index bcbe7c7..00d9517 100644 --- a/machine/jobs/settings.yaml +++ b/machine/jobs/settings.yaml @@ -8,15 +8,17 @@ default: train_params: do_train: true optim: adamw_torch - warmup_steps: 4000 + warmup_steps: 1000 per_device_train_batch_size: 16 gradient_accumulation_steps: 4 label_smoothing_factor: 0.2 group_by_length: true gradient_checkpointing: true + lr_scheduler_type: cosine + learning_rate: 0.0002 fp16: true save_strategy: no - max_steps: 20000 + max_steps: 5000 generate_params: device: 0 num_beams: 2