# coding=utf-8
import os
import logging

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

logger = logging.getLogger(__name__)

def build_optimizer_and_scheduler(args, model, t_total):
    module = (
        model.module if hasattr(model, "module") else model
    )

    # Differential learning rates: BERT, CRF, and the remaining modules
    # each get their own learning rate.
    no_decay = ["bias", "LayerNorm.weight"]
    model_param = list(module.named_parameters())

    bert_param_optimizer = []
    crf_param_optimizer = []
    other_param_optimizer = []

    for name, para in model_param:
        space = name.split('.')
        if space[0] == 'bert_module':
            bert_param_optimizer.append((name, para))
        elif space[0] == 'crf':
            crf_param_optimizer.append((name, para))
        else:
            other_param_optimizer.append((name, para))

    optimizer_grouped_parameters = [
        # BERT module
        {"params": [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay, 'lr': args.lr},
        {"params": [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0, 'lr': args.lr},

        # CRF module
        {"params": [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay, 'lr': args.crf_lr},
- {"params": [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)],
- "weight_decay": 0.0, 'lr': args.other_lr},

        # Other modules (differential learning rate)
        {"params": [p for n, p in other_param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay, 'lr': args.other_lr},
        {"params": [p for n, p in other_param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0, 'lr': args.other_lr},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(args.warmup_proportion * t_total), num_training_steps=t_total
    )

    return optimizer, scheduler
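

# Usage sketch for build_optimizer_and_scheduler (illustrative, not part of
# this module): `args` is expected to provide lr, crf_lr, other_lr,
# weight_decay, adam_epsilon and warmup_proportion; `train_loader` and
# `args.train_epochs` below are hypothetical.
#
#   t_total = len(train_loader) * args.train_epochs
#   optimizer, scheduler = build_optimizer_and_scheduler(args, model, t_total)
#   for batch in train_loader:
#       loss = model(**batch)
#       loss.backward()
#       optimizer.step()
#       scheduler.step()
#       optimizer.zero_grad()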


def save_model(args, model, model_name):
    """Save the model that performs best on the validation set."""
    output_dir = os.path.join(args.output_dir, '{}'.format(model_name))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Take care of distributed / parallel training: unwrap the model first.
    model_to_save = (
        model.module if hasattr(model, "module") else model
    )
    logger.info('Saving model checkpoint to {}'.format(output_dir))
    torch.save(model_to_save.state_dict(), os.path.join(output_dir, 'model.pt'))


def save_model_step(args, model, global_step):
    """Save a model checkpoint at the given global_step."""
    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Take care of distributed / parallel training: unwrap the model first.
    model_to_save = (
        model.module if hasattr(model, "module") else model
    )
    logger.info('Saving model checkpoint to {}'.format(output_dir))
    torch.save(model_to_save.state_dict(), os.path.join(output_dir, 'model.pt'))
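

# Sketch of how the two save helpers are typically combined in a training
# loop (illustrative; `eval_f1`, `best_f1` and `args.save_steps` are
# hypothetical bookkeeping names, not defined in this module):
#
#   if global_step % args.save_steps == 0:
#       save_model_step(args, model, global_step)   # periodic checkpoint
#   if eval_f1 > best_f1:
#       best_f1 = eval_f1
#       save_model(args, model, 'best')             # best-on-dev snapshot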


def load_model_and_parallel(model, gpu_ids, ckpt_path=None, strict=True):
    """Load the model and place it on GPU(s) (single card / multi-card)."""
    gpu_ids = gpu_ids.split(',')

    # Set the device to the first listed GPU ('-1' means CPU).
    device = torch.device("cpu" if gpu_ids[0] == '-1' else "cuda:" + gpu_ids[0])

    if ckpt_path is not None:
        logger.info('Load ckpt from {}'.format(ckpt_path))
        model.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu')), strict=strict)

    model.to(device)

    if len(gpu_ids) > 1:
        logger.info('Use multi gpus in: {}'.format(gpu_ids))
        gpu_ids = [int(x) for x in gpu_ids]
        model = torch.nn.DataParallel(model, device_ids=gpu_ids)
    else:
        logger.info('Use single gpu in: {}'.format(gpu_ids))

    return model, device
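

# Usage sketch for load_model_and_parallel (illustrative; `MyModel` and the
# checkpoint path are hypothetical):
#
#   model = MyModel(args)
#   model, device = load_model_and_parallel(model, '0,1',
#                                           ckpt_path='out/best/model.pt')
#   model.eval()  # DataParallel-wrapped when more than one GPU id is given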