Skip to content
Snippets Groups Projects
Commit e553c103 authored by xuetao chen's avatar xuetao chen
Browse files

update

parent e5a3f93a
No related merge requests found
Showing
with 838 additions and 6 deletions
# Distributed-training environment derived from the Slurm task variables.
# MASTER_ADDR is set to this task's own hostname.
# NOTE(review): that only yields one shared address when every task runs on the same node — confirm.
export MASTER_ADDR=$(hostname)
# Global rank (RANK / WORLD_RANK) and per-node rank (LOCAL_RANK) are taken from Slurm.
export RANK=$SLURM_PROCID
export WORLD_RANK=$SLURM_PROCID
export LOCAL_RANK=$SLURM_LOCALID
# World size is the number of tasks Slurm reports for the job.
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_PORT=12253 # custom rendezvous port; NOTE(review): the torch launcher default is 29500, not 12253
export WANDB_START_METHOD="thread"
# Restrict NCCL socket traffic to this network interface.
export NCCL_SOCKET_IFNAME="ens1f0"
# Smoke test: configure the env:// rendezvous variables by hand and try to
# initialise the default process group with a short timeout.
import os
from datetime import timedelta

# NOTE(review): WORLD_SIZE is 2 while RANK is fixed at 0, so a peer process
# with RANK=1 presumably has to join before the 10 s timeout — confirm.
os.environ.update({
    'MASTER_ADDR': '127.0.0.1',
    'MASTER_PORT': '19500',
    'RANK': '0',
    'WORLD_RANK': '0',
    'LOCAL_RANK': '0',
    'WORLD_SIZE': '2',
    'WANDB_START_METHOD': 'thread',
})

# Imported only after the environment has been populated, as in the original.
import torch.distributed as dist

dist.init_process_group(timeout=timedelta(seconds=10))
# Open an interactive bash shell (--pty) on partition GPU-8A100 with two A100 GPUs,
# a one-hour time limit and the gpu_8a100 QOS.
srun -p GPU-8A100 --time=1:00:00 --gres=gpu:a100:2 --qos=gpu_8a100 --pty bash
\ No newline at end of file
#!/bin/bash
# Slurm batch script: launches train.py as 2 tasks on one node (8 CPUs and
# one of the two requested A100 GPUs per task) for the AFNO backbone config.
#SBATCH --time=01:00:00
#SBATCH -C gpu
#SBATCH -J afno
#SBATCH -o afno_backbone_finetune.out
#SBATCH -N 1
# The resource options must carry the #SBATCH prefix; as a bare line bash
# would try to execute "-n" as a command and Slurm would never see them.
# The duplicated "-p GPU-8A100" has been collapsed to a single occurrence.
#SBATCH -n 2 -c 8 -p GPU-8A100 --gres=gpu:a100:2 --qos=gpu_8a100

config_file=./config/AFNO.yaml
config='afno_backbone_tec_ustc'
run_num='0'

export HDF5_USE_FILE_LOCKING=FALSE
export NCCL_NET_GDR_LEVEL=PHB
export MASTER_ADDR=$(hostname)

# Echo each command so the job log shows exactly what was launched.
set -x
# $config_file, $config and $run_num are expanded here, by the submitting
# shell, before the command string is handed to each task's bash.
srun -u --mpi=pmi2 shifter \
bash -c "
source export_DDP_vars.sh
python train.py --enable_amp --yaml_config=$config_file --config=$config --run_num=$run_num
"
......@@ -5,4 +5,3 @@ export LOCAL_RANK=$SLURM_LOCALID
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_PORT=29500 # default from torch launcher
export WANDB_START_METHOD="thread"
export NCCL_SOCKET_IFNAME="ens1f0"
multi.py 0 → 100644
# Minimal torch.distributed rendezvous check using the NCCL backend:
# initialise the default process group, report this process's rank and the
# world size, then tear the group down again.
import os
from datetime import timedelta

import torch.distributed as dist

# Rendezvous defaults for a local run. setdefault() keeps any value the
# launcher has already exported (for example RANK / WORLD_SIZE derived from
# Slurm). Unconditionally forcing RANK='0' with WORLD_SIZE='2' made every
# launched process claim rank 0, so the two-process rendezvous could never
# complete.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29501')
os.environ.setdefault('RANK', '0')
os.environ.setdefault('WORLD_SIZE', '2')

# Short timeout so a missing peer fails fast instead of hanging.
dist.init_process_group(backend='nccl', timeout=timedelta(seconds=10))
try:
    # Rank of this process and total number of participating processes.
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f"Rank: {rank}, World size: {world_size}")
    # Distributed training code would go here.
finally:
    # Always release the process group, even if the body above raises.
    dist.destroy_process_group()
# Single-process torch.distributed check using the NCCL backend: initialise
# a world of size 1, print rank and world size, then shut the group down.
from datetime import timedelta
import torch.distributed as dist
import os

# Address and port of the master process, plus this process's rank and the
# total number of processes.
os.environ.update({
    'MASTER_ADDR': '127.0.0.1',
    'MASTER_PORT': '29501',
    'RANK': '0',
    'WORLD_SIZE': '1',
})

# Initialise the distributed environment with a 10-second timeout.
init_timeout = timedelta(seconds=10)
dist.init_process_group(backend='nccl', timeout=init_timeout)

# Report this process's rank and the total number of processes.
rank, world_size = dist.get_rank(), dist.get_world_size()
print(f"Rank: {rank}, World size: {world_size}")

# Distributed training code would go here.

# Release the process group's resources.
dist.destroy_process_group()
log_to_wandb: 'False'
lr: '0.0005'
batch_size: '4'
patch_size: '2'
depth: '6'
img_size: '[128, 256]'
max_epochs: '1500'
scheduler: CosineAnnealingLR
in_channels: '[ 0 1 2 3 4 5 6 7 8 9 10 11 12]'
out_channels: '[ 0 1 2 3 4 5 6 7 8 9 10 11 12]'
prediction_length: '100'
orography: 'False'
orography_path: None
exp_dir: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128
train_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train
valid_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test
inf_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/out_of_sample
time_means_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/time_means.npy
global_means_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_means.npy
global_stds_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_stds.npy
loss: l2
num_data_workers: '4'
dt: '1'
n_history: '0'
prediction_type: iterative
n_initial_conditions: '5'
ics_type: default
save_raw_forecasts: 'True'
save_channel: 'False'
masked_acc: 'False'
maskpath: None
perturb: 'False'
add_grid: 'False'
N_grid_channels: '0'
gridtype: sinusoidal
roll: 'False'
num_blocks: '8'
nettype: afno
width: '56'
modes: '32'
target: default
normalization: zscore
log_to_screen: 'True'
save_checkpoint: 'True'
enable_nhwc: 'False'
optimizer_type: FusedAdam
crop_size_x: None
crop_size_y: None
two_step_training: 'False'
plot_animations: 'False'
add_noise: 'False'
noise_std: '0'
epsilon_factor: '0'
world_size: '2'
global_batch_size: '8'
experiment_dir: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/0
checkpoint_path: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/0/training_checkpoints/ckpt.tar
best_checkpoint_path: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/0/training_checkpoints/best_ckpt.tar
resuming: 'False'
local_rank: '0'
enable_amp: 'True'
name: afno_backbone_tec_ustc_0
group: era5_precipafno_backbone_tec_ustc
project: ERA5_precip
entity: flowgan
N_in_channels: '13'
N_out_channels: '13'
This diff is collapsed.
log_to_wandb: 'False'
lr: '0.0005'
batch_size: '8'
patch_size: '2'
depth: '6'
img_size: '[128, 256]'
max_epochs: '1500'
scheduler: CosineAnnealingLR
in_channels: '[ 0 1 2 3 4 5 6 7 8 9 10 11 12]'
out_channels: '[ 0 1 2 3 4 5 6 7 8 9 10 11 12]'
prediction_length: '100'
orography: 'False'
orography_path: None
exp_dir: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128
train_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train
valid_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test
inf_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/out_of_sample
time_means_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/time_means.npy
global_means_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_means.npy
global_stds_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_stds.npy
loss: l2
num_data_workers: '4'
dt: '1'
n_history: '0'
prediction_type: iterative
n_initial_conditions: '5'
ics_type: default
save_raw_forecasts: 'True'
save_channel: 'False'
masked_acc: 'False'
maskpath: None
perturb: 'False'
add_grid: 'False'
N_grid_channels: '0'
gridtype: sinusoidal
roll: 'False'
num_blocks: '8'
nettype: afno
width: '56'
modes: '32'
target: default
normalization: zscore
log_to_screen: 'True'
save_checkpoint: 'True'
enable_nhwc: 'False'
optimizer_type: FusedAdam
crop_size_x: None
crop_size_y: None
two_step_training: 'False'
plot_animations: 'False'
add_noise: 'False'
noise_std: '0'
epsilon_factor: '0'
world_size: '1'
experiment_dir: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/1
checkpoint_path: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/1/training_checkpoints/ckpt.tar
best_checkpoint_path: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/1/training_checkpoints/best_ckpt.tar
resuming: 'False'
local_rank: '0'
enable_amp: 'True'
name: afno_backbone_tec_ustc_1
group: era5_precipafno_backbone_tec_ustc
project: ERA5_precip
entity: flowgan
N_in_channels: '13'
N_out_channels: '13'
2024-02-18 19:52:39,146 - root - INFO - --------------- Versions ---------------
2024-02-18 19:52:39,170 - root - INFO - git branch: b'* master'
2024-02-18 19:52:39,180 - root - INFO - git hash: b'd648a75b71d64c88af65bd3eaafde326f1aa2cf6'
2024-02-18 19:52:39,180 - root - INFO - Torch: 1.13.1+cu117
2024-02-18 19:52:39,180 - root - INFO - ----------------------------------------
2024-02-18 19:52:39,181 - root - INFO - ------------------ Configuration ------------------
2024-02-18 19:52:39,181 - root - INFO - Configuration file: /gpfs/home/ess/cxt/work/FourCastNetTEC/config/AFNO.yaml
2024-02-18 19:52:39,181 - root - INFO - Configuration name: afno_backbone_tec_ustc
2024-02-18 19:52:39,181 - root - INFO - log_to_wandb False
2024-02-18 19:52:39,181 - root - INFO - lr 0.0005
2024-02-18 19:52:39,181 - root - INFO - batch_size 8
2024-02-18 19:52:39,181 - root - INFO - patch_size 2
2024-02-18 19:52:39,181 - root - INFO - depth 6
2024-02-18 19:52:39,181 - root - INFO - img_size [128, 256]
2024-02-18 19:52:39,181 - root - INFO - max_epochs 1500
2024-02-18 19:52:39,181 - root - INFO - scheduler CosineAnnealingLR
2024-02-18 19:52:39,181 - root - INFO - in_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-18 19:52:39,181 - root - INFO - out_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-18 19:52:39,181 - root - INFO - prediction_length 100
2024-02-18 19:52:39,181 - root - INFO - orography False
2024-02-18 19:52:39,181 - root - INFO - orography_path None
2024-02-18 19:52:39,181 - root - INFO - exp_dir /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128
2024-02-18 19:52:39,181 - root - INFO - train_data_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train
2024-02-18 19:52:39,181 - root - INFO - valid_data_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test
2024-02-18 19:52:39,181 - root - INFO - inf_data_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/out_of_sample
2024-02-18 19:52:39,181 - root - INFO - time_means_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/time_means.npy
2024-02-18 19:52:39,181 - root - INFO - global_means_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_means.npy
2024-02-18 19:52:39,181 - root - INFO - global_stds_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_stds.npy
2024-02-18 19:52:39,181 - root - INFO - loss l2
2024-02-18 19:52:39,181 - root - INFO - num_data_workers 4
2024-02-18 19:52:39,181 - root - INFO - dt 1
2024-02-18 19:52:39,181 - root - INFO - n_history 0
2024-02-18 19:52:39,181 - root - INFO - prediction_type iterative
2024-02-18 19:52:39,181 - root - INFO - n_initial_conditions 5
2024-02-18 19:52:39,181 - root - INFO - ics_type default
2024-02-18 19:52:39,181 - root - INFO - save_raw_forecasts True
2024-02-18 19:52:39,181 - root - INFO - save_channel False
2024-02-18 19:52:39,182 - root - INFO - masked_acc False
2024-02-18 19:52:39,182 - root - INFO - maskpath None
2024-02-18 19:52:39,182 - root - INFO - perturb False
2024-02-18 19:52:39,182 - root - INFO - add_grid False
2024-02-18 19:52:39,182 - root - INFO - N_grid_channels 0
2024-02-18 19:52:39,182 - root - INFO - gridtype sinusoidal
2024-02-18 19:52:39,182 - root - INFO - roll False
2024-02-18 19:52:39,182 - root - INFO - num_blocks 8
2024-02-18 19:52:39,182 - root - INFO - nettype afno
2024-02-18 19:52:39,182 - root - INFO - width 56
2024-02-18 19:52:39,182 - root - INFO - modes 32
2024-02-18 19:52:39,182 - root - INFO - target default
2024-02-18 19:52:39,182 - root - INFO - normalization zscore
2024-02-18 19:52:39,182 - root - INFO - log_to_screen True
2024-02-18 19:52:39,182 - root - INFO - save_checkpoint True
2024-02-18 19:52:39,182 - root - INFO - enable_nhwc False
2024-02-18 19:52:39,182 - root - INFO - optimizer_type FusedAdam
2024-02-18 19:52:39,182 - root - INFO - crop_size_x None
2024-02-18 19:52:39,182 - root - INFO - crop_size_y None
2024-02-18 19:52:39,182 - root - INFO - two_step_training False
2024-02-18 19:52:39,182 - root - INFO - plot_animations False
2024-02-18 19:52:39,184 - root - INFO - add_noise False
2024-02-18 19:52:39,184 - root - INFO - noise_std 0
2024-02-18 19:52:39,184 - root - INFO - epsilon_factor 0
2024-02-18 19:52:39,184 - root - INFO - world_size 1
2024-02-18 19:52:39,184 - root - INFO - experiment_dir /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/1
2024-02-18 19:52:39,185 - root - INFO - checkpoint_path /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/1/training_checkpoints/ckpt.tar
2024-02-18 19:52:39,185 - root - INFO - best_checkpoint_path /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/1/training_checkpoints/best_ckpt.tar
2024-02-18 19:52:39,185 - root - INFO - resuming False
2024-02-18 19:52:39,185 - root - INFO - local_rank 0
2024-02-18 19:52:39,185 - root - INFO - enable_amp True
2024-02-18 19:52:39,185 - root - INFO - name afno_backbone_tec_ustc_1
2024-02-18 19:52:39,185 - root - INFO - group era5_precipafno_backbone_tec_ustc
2024-02-18 19:52:39,185 - root - INFO - project ERA5_precip
2024-02-18 19:52:39,185 - root - INFO - entity flowgan
2024-02-18 19:52:39,185 - root - INFO - ---------------------------------------------------
2024-02-18 19:52:39,191 - root - INFO - rank 0, begin data loader init
2024-02-18 19:52:39,210 - root - INFO - Getting file stats from /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train/2005.h5
2024-02-18 19:52:39,212 - root - INFO - Number of samples per year: 336
2024-02-18 19:52:39,212 - root - INFO - Found data at path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train. Number of examples: 2352. Image Shape: 128 x 256 x 13
2024-02-18 19:52:39,212 - root - INFO - Delta t: 6 hours
2024-02-18 19:52:39,212 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-18 19:52:39,219 - root - INFO - Getting file stats from /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test/2008.h5
2024-02-18 19:52:39,219 - root - INFO - Number of samples per year: 337
2024-02-18 19:52:39,219 - root - INFO - Found data at path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test. Number of examples: 674. Image Shape: 128 x 256 x 13
2024-02-18 19:52:39,220 - root - INFO - Delta t: 6 hours
2024-02-18 19:52:39,220 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-18 19:52:39,220 - root - INFO - rank 0, data loader initialized
2024-02-18 19:52:46,441 - root - INFO - Number of trainable model parameters: 36514560
2024-02-18 19:52:46,441 - root - INFO - Starting Training Loop...
log_to_wandb: 'False'
lr: '0.0005'
batch_size: '8'
patch_size: '2'
depth: '6'
img_size: '[128, 256]'
max_epochs: '1500'
scheduler: CosineAnnealingLR
in_channels: '[ 0 1 2 3 4 5 6 7 8 9 10 11 12]'
out_channels: '[ 0 1 2 3 4 5 6 7 8 9 10 11 12]'
prediction_length: '100'
orography: 'False'
orography_path: None
exp_dir: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128
train_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train
valid_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test
inf_data_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/out_of_sample
time_means_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/time_means.npy
global_means_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_means.npy
global_stds_path: /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_stds.npy
loss: l2
num_data_workers: '4'
dt: '1'
n_history: '0'
prediction_type: iterative
n_initial_conditions: '5'
ics_type: default
save_raw_forecasts: 'True'
save_channel: 'False'
masked_acc: 'False'
maskpath: None
perturb: 'False'
add_grid: 'False'
N_grid_channels: '0'
gridtype: sinusoidal
roll: 'False'
num_blocks: '8'
nettype: afno
width: '56'
modes: '32'
target: default
normalization: zscore
log_to_screen: 'True'
save_checkpoint: 'True'
enable_nhwc: 'False'
optimizer_type: FusedAdam
crop_size_x: None
crop_size_y: None
two_step_training: 'False'
plot_animations: 'False'
add_noise: 'False'
noise_std: '0'
epsilon_factor: '0'
world_size: '1'
experiment_dir: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/d6p2
checkpoint_path: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/d6p2/training_checkpoints/ckpt.tar
best_checkpoint_path: /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/d6p2/training_checkpoints/best_ckpt.tar
resuming: 'True'
local_rank: '0'
enable_amp: 'True'
name: afno_backbone_tec_ustc_d6p2
group: era5_precipafno_backbone_tec_ustc
project: ERA5_precip
entity: flowgan
N_in_channels: '13'
N_out_channels: '13'
2024-02-18 20:36:31,110 - root - INFO - --------------- Versions ---------------
2024-02-18 20:36:31,132 - root - INFO - git branch: b'* master'
2024-02-18 20:36:31,146 - root - INFO - git hash: b'c34cbdfeff02190cf62bcbe7c2cb1a0630fa01b5'
2024-02-18 20:36:31,146 - root - INFO - Torch: 1.13.1+cu117
2024-02-18 20:36:31,146 - root - INFO - ----------------------------------------
2024-02-18 20:36:31,146 - root - INFO - ------------------ Configuration ------------------
2024-02-18 20:36:31,146 - root - INFO - Configuration file: /gpfs/home/ess/cxt/work/FourCastNetTEC/config/AFNO.yaml
2024-02-18 20:36:31,146 - root - INFO - Configuration name: afno_backbone_tec_ustc
2024-02-18 20:36:31,146 - root - INFO - log_to_wandb False
2024-02-18 20:36:31,146 - root - INFO - lr 0.0005
2024-02-18 20:36:31,146 - root - INFO - batch_size 8
2024-02-18 20:36:31,146 - root - INFO - patch_size 2
2024-02-18 20:36:31,146 - root - INFO - depth 6
2024-02-18 20:36:31,146 - root - INFO - img_size [128, 256]
2024-02-18 20:36:31,146 - root - INFO - max_epochs 1500
2024-02-18 20:36:31,146 - root - INFO - scheduler CosineAnnealingLR
2024-02-18 20:36:31,146 - root - INFO - in_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-18 20:36:31,147 - root - INFO - out_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-18 20:36:31,147 - root - INFO - prediction_length 100
2024-02-18 20:36:31,147 - root - INFO - orography False
2024-02-18 20:36:31,147 - root - INFO - orography_path None
2024-02-18 20:36:31,147 - root - INFO - exp_dir /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128
2024-02-18 20:36:31,147 - root - INFO - train_data_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train
2024-02-18 20:36:31,147 - root - INFO - valid_data_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test
2024-02-18 20:36:31,147 - root - INFO - inf_data_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/out_of_sample
2024-02-18 20:36:31,147 - root - INFO - time_means_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/time_means.npy
2024-02-18 20:36:31,147 - root - INFO - global_means_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_means.npy
2024-02-18 20:36:31,147 - root - INFO - global_stds_path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/global_stds.npy
2024-02-18 20:36:31,147 - root - INFO - loss l2
2024-02-18 20:36:31,147 - root - INFO - num_data_workers 4
2024-02-18 20:36:31,147 - root - INFO - dt 1
2024-02-18 20:36:31,147 - root - INFO - n_history 0
2024-02-18 20:36:31,147 - root - INFO - prediction_type iterative
2024-02-18 20:36:31,147 - root - INFO - n_initial_conditions 5
2024-02-18 20:36:31,147 - root - INFO - ics_type default
2024-02-18 20:36:31,147 - root - INFO - save_raw_forecasts True
2024-02-18 20:36:31,147 - root - INFO - save_channel False
2024-02-18 20:36:31,147 - root - INFO - masked_acc False
2024-02-18 20:36:31,147 - root - INFO - maskpath None
2024-02-18 20:36:31,147 - root - INFO - perturb False
2024-02-18 20:36:31,147 - root - INFO - add_grid False
2024-02-18 20:36:31,147 - root - INFO - N_grid_channels 0
2024-02-18 20:36:31,147 - root - INFO - gridtype sinusoidal
2024-02-18 20:36:31,147 - root - INFO - roll False
2024-02-18 20:36:31,147 - root - INFO - num_blocks 8
2024-02-18 20:36:31,147 - root - INFO - nettype afno
2024-02-18 20:36:31,147 - root - INFO - width 56
2024-02-18 20:36:31,147 - root - INFO - modes 32
2024-02-18 20:36:31,147 - root - INFO - target default
2024-02-18 20:36:31,147 - root - INFO - normalization zscore
2024-02-18 20:36:31,147 - root - INFO - log_to_screen True
2024-02-18 20:36:31,148 - root - INFO - save_checkpoint True
2024-02-18 20:36:31,148 - root - INFO - enable_nhwc False
2024-02-18 20:36:31,148 - root - INFO - optimizer_type FusedAdam
2024-02-18 20:36:31,148 - root - INFO - crop_size_x None
2024-02-18 20:36:31,148 - root - INFO - crop_size_y None
2024-02-18 20:36:31,148 - root - INFO - two_step_training False
2024-02-18 20:36:31,148 - root - INFO - plot_animations False
2024-02-18 20:36:31,149 - root - INFO - add_noise False
2024-02-18 20:36:31,149 - root - INFO - noise_std 0
2024-02-18 20:36:31,149 - root - INFO - epsilon_factor 0
2024-02-18 20:36:31,149 - root - INFO - world_size 1
2024-02-18 20:36:31,149 - root - INFO - experiment_dir /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/d6p2
2024-02-18 20:36:31,149 - root - INFO - checkpoint_path /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/d6p2/training_checkpoints/ckpt.tar
2024-02-18 20:36:31,150 - root - INFO - best_checkpoint_path /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/d6p2/training_checkpoints/best_ckpt.tar
2024-02-18 20:36:31,150 - root - INFO - resuming True
2024-02-18 20:36:31,150 - root - INFO - local_rank 0
2024-02-18 20:36:31,150 - root - INFO - enable_amp True
2024-02-18 20:36:31,150 - root - INFO - name afno_backbone_tec_ustc_d6p2
2024-02-18 20:36:31,150 - root - INFO - group era5_precipafno_backbone_tec_ustc
2024-02-18 20:36:31,150 - root - INFO - project ERA5_precip
2024-02-18 20:36:31,150 - root - INFO - entity flowgan
2024-02-18 20:36:31,150 - root - INFO - ---------------------------------------------------
2024-02-18 20:36:31,156 - root - INFO - rank 0, begin data loader init
2024-02-18 20:36:31,165 - root - INFO - Getting file stats from /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train/2005.h5
2024-02-18 20:36:31,166 - root - INFO - Number of samples per year: 336
2024-02-18 20:36:31,166 - root - INFO - Found data at path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/train. Number of examples: 2352. Image Shape: 128 x 256 x 13
2024-02-18 20:36:31,166 - root - INFO - Delta t: 6 hours
2024-02-18 20:36:31,166 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-18 20:36:31,179 - root - INFO - Getting file stats from /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test/2008.h5
2024-02-18 20:36:31,179 - root - INFO - Number of samples per year: 337
2024-02-18 20:36:31,179 - root - INFO - Found data at path /home/ess/cxt/work/FourCastNetTEC/Data_TEC_ustc_128/test. Number of examples: 674. Image Shape: 128 x 256 x 13
2024-02-18 20:36:31,179 - root - INFO - Delta t: 6 hours
2024-02-18 20:36:31,179 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-18 20:36:31,179 - root - INFO - rank 0, data loader initialized
2024-02-18 20:36:33,751 - root - INFO - Loading checkpoint /home/ess/cxt/work/FourCastNetTEC/results/tec_ustc_128/afno_backbone_tec_ustc/d6p2/training_checkpoints/ckpt.tar
2024-02-18 20:36:34,805 - root - INFO - Number of trainable model parameters: 36514560
2024-02-18 20:36:34,805 - root - INFO - Starting Training Loop...
# Launch train.py as 2 tasks on one node (8 CPUs per task, two A100 GPUs)
# inside the "pytorch" conda environment.
# "conda init" only edits shell startup files and does not define the conda
# shell function in the current non-interactive shell, so "conda activate"
# would fail; sourcing conda.sh from the same miniconda install defines it.
srun -u --mpi=pmi2 --nodes=1 --ntasks-per-node=2 --cpus-per-task=8 -p GPU-8A100 --time=1:00:00 --gres=gpu:a100:2 --qos=gpu_8a100 \
bash -c "
source /home/ess/cxt/miniconda3/etc/profile.d/conda.sh
source export_DDP_vars.sh
conda activate pytorch
python train.py --enable_amp --yaml_config=./config/AFNO.yaml --config=afno_backbone_tec_ustc --run_num=1
"
# Launch train.py as 2 tasks on one node (8 CPUs per task, two A100 GPUs)
# inside the "pytorch" conda environment.
# "conda init" only edits shell startup files and does not define the conda
# shell function in the current non-interactive shell, so "conda activate"
# would fail; sourcing conda.sh from the same miniconda install defines it.
srun -u --mpi=pmi2 --nodes=1 --ntasks-per-node=2 --cpus-per-task=8 -p GPU-8A100 --time=1:00:00 --gres=gpu:a100:2 --qos=gpu_8a100 \
bash -c "
source /home/ess/cxt/miniconda3/etc/profile.d/conda.sh
source export_DDP_vars.sh
conda activate pytorch
python train.py --enable_amp --yaml_config=./config/AFNO.yaml --config=afno_backbone_tec_ustc --run_num=1
"
# Interactive bash shell on one node with 2 tasks, 8 CPUs per task and two A100 GPUs (1 h limit).
srun --nodes=1 --ntasks-per-node=2 --cpus-per-task=8 -p GPU-8A100 --time=1:00:00 --gres=gpu:a100:2 --qos=gpu_8a100 --pty bash
# Same request without the explicit node / task counts.
srun --cpus-per-task=8 -p GPU-8A100 --time=1:00:00 --gres=gpu:a100:2 --qos=gpu_8a100 --pty bash
#!/bin/bash -l
#SBATCH --time=01:00:00
#SBATCH -C gpu
#SBATCH -J afno
#SBATCH -o afno_backbone_finetune.out
#SBATCH -N 1 -n 2 -c 8
#SBATCH --gres=gpu:v100:2 -p GPU-8A100 --qos=gpu_8a100
#SBATCH -N 1 -n 2 -c 8 --gres=gpu:a100:2 -p GPU-8A100 --qos=gpu_8a100
config_file=./config/AFNO.yaml
config='afno_backbone_tec_ustc'
......@@ -16,7 +14,7 @@ export NCCL_NET_GDR_LEVEL=PHB
export MASTER_ADDR=$(hostname)
set -x
srun -u --mpi=pmi2 shifter \
srun -u --mpi=pmi2 \
bash -c "
source export_DDP_vars.sh
python train.py --enable_amp --yaml_config=$config_file --config=$config --run_num=$run_num
......
......@@ -45,6 +45,11 @@
#Animashree Anandkumar - California Institute of Technology, NVIDIA Corporation
import os
os.environ["NCCL_IB_TC"] = "128"
os.environ["NCCL_IB_GID_INDEX"] = "3"
os.environ["NCCL_IB_TIMEOUT"] = "22"
os.environ["NCCL_SOCKET_IFNAME"] = "eth0"
import time
from datetime import timedelta
import numpy as np
......@@ -621,4 +626,4 @@ if __name__ == '__main__':
trainer = Trainer(params, world_rank)
trainer.train()
logging.info('DONE ---- rank %d'%world_rank)
\ No newline at end of file
logging.info('DONE ---- rank %d'%world_rank)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment