Skip to content
Snippets Groups Projects
Commit d858e370 authored by xuetaowave's avatar xuetaowave
Browse files

update

parent 1fa9d270
No related merge requests found
......@@ -2,6 +2,7 @@
full_field: &FULL_FIELD
loss: 'l2'
lr: 1E-3
retrain: !!bool False
scheduler: 'ReduceLROnPlateau'
num_data_workers: 4
dt: 1 # how many timesteps ahead the model will predict
......
......@@ -3,6 +3,7 @@ full_field: &FULL_FIELD
loss: 'l2'
lr: 1E-3
scheduler: 'ReduceLROnPlateau'
retrain: !!bool False
num_data_workers: 4
dt: 1 # how many timesteps ahead the model will predict
n_history: 0 #how many previous timesteps to consider
......
......@@ -19,6 +19,7 @@ time_means_path: ./time_means.npy
global_means_path: ./global_means.npy
global_stds_path: ./global_stds.npy
loss: l2
retrain: 'False'
num_data_workers: '4'
dt: '1'
n_history: '0'
......@@ -57,7 +58,7 @@ experiment_dir:
checkpoint_path: ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
best_checkpoint_path:
./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/best_ckpt.tar
resuming: 'False'
resuming: 'True'
local_rank: '0'
enable_amp: 'True'
name: afno_backbone_ljkj_d6p_test1
......
......@@ -178,3 +178,626 @@
2024-02-19 21:17:23,055 - root - INFO - Time taken for epoch 2 is 59.7992959022522 sec
2024-02-19 21:17:23,055 - root - INFO - Train loss: 0.2991207540035248. Valid loss: 0.2673646807670593
2024-02-19 21:17:23,093 - root - INFO - Memory Used: 12272.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:18:22,467 - root - INFO - Time taken for epoch 3 is 59.374216079711914 sec
2024-02-19 21:18:22,468 - root - INFO - Train loss: 0.2890133857727051. Valid loss: 0.2473258525133133
2024-02-19 21:18:22,497 - root - INFO - Memory Used: 12069.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:19:21,076 - root - INFO - Time taken for epoch 4 is 58.57922625541687 sec
2024-02-19 21:19:21,077 - root - INFO - Train loss: 0.27804723381996155. Valid loss: 0.2527182102203369
2024-02-19 21:19:21,112 - root - INFO - Memory Used: 12058.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:20:20,030 - root - INFO - Time taken for epoch 5 is 58.91758060455322 sec
2024-02-19 21:20:20,030 - root - INFO - Train loss: 0.24430719017982483. Valid loss: 0.22843459248542786
2024-02-19 21:20:20,063 - root - INFO - Memory Used: 12127.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:21:19,566 - root - INFO - Time taken for epoch 6 is 59.50251913070679 sec
2024-02-19 21:21:19,566 - root - INFO - Train loss: 0.2356795072555542. Valid loss: 0.2140081375837326
2024-02-19 21:21:19,599 - root - INFO - Memory Used: 11846.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:22:18,692 - root - INFO - Time taken for epoch 7 is 59.09311866760254 sec
2024-02-19 21:22:18,692 - root - INFO - Train loss: 0.24076075851917267. Valid loss: 0.22290067374706268
2024-02-19 21:22:18,723 - root - INFO - Memory Used: 11986.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:23:18,163 - root - INFO - Time taken for epoch 8 is 59.44004440307617 sec
2024-02-19 21:23:18,163 - root - INFO - Train loss: 0.23235246539115906. Valid loss: 0.21175424754619598
2024-02-19 21:23:18,199 - root - INFO - Memory Used: 12017.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:24:17,062 - root - INFO - Time taken for epoch 9 is 58.863529682159424 sec
2024-02-19 21:24:17,063 - root - INFO - Train loss: 0.23020009696483612. Valid loss: 0.21190962195396423
2024-02-19 21:24:17,152 - root - INFO - Memory Used: 12186.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:25:16,081 - root - INFO - Time taken for epoch 10 is 58.928956270217896 sec
2024-02-19 21:25:16,082 - root - INFO - Train loss: 0.2270565778017044. Valid loss: 0.20935162901878357
2024-02-19 21:25:16,122 - root - INFO - Memory Used: 12176.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:26:14,648 - root - INFO - Time taken for epoch 11 is 58.525614976882935 sec
2024-02-19 21:26:14,648 - root - INFO - Train loss: 0.24752986431121826. Valid loss: 0.21659743785858154
2024-02-19 21:26:14,681 - root - INFO - Memory Used: 12170.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:27:13,519 - root - INFO - Time taken for epoch 12 is 58.83862257003784 sec
2024-02-19 21:27:13,520 - root - INFO - Train loss: 0.2231183499097824. Valid loss: 0.20745661854743958
2024-02-19 21:27:13,552 - root - INFO - Memory Used: 12177.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:28:12,415 - root - INFO - Time taken for epoch 13 is 58.862244844436646 sec
2024-02-19 21:28:12,415 - root - INFO - Train loss: 0.22065123915672302. Valid loss: 0.20001713931560516
2024-02-19 21:28:12,448 - root - INFO - Memory Used: 12176.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:29:11,303 - root - INFO - Time taken for epoch 14 is 58.85520100593567 sec
2024-02-19 21:29:11,303 - root - INFO - Train loss: 0.22018755972385406. Valid loss: 0.1985495686531067
2024-02-19 21:29:11,346 - root - INFO - Memory Used: 12176.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:30:09,813 - root - INFO - Time taken for epoch 15 is 58.466259479522705 sec
2024-02-19 21:30:09,813 - root - INFO - Train loss: 0.21955715119838715. Valid loss: 0.1994101107120514
2024-02-19 21:30:09,845 - root - INFO - Memory Used: 12170.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:31:08,680 - root - INFO - Time taken for epoch 16 is 58.834797382354736 sec
2024-02-19 21:31:08,680 - root - INFO - Train loss: 0.21635444462299347. Valid loss: 0.19454137980937958
2024-02-19 21:31:08,708 - root - INFO - Memory Used: 12170.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:32:07,577 - root - INFO - Time taken for epoch 17 is 58.86822152137756 sec
2024-02-19 21:32:07,577 - root - INFO - Train loss: 0.21468757092952728. Valid loss: 0.19264116883277893
2024-02-19 21:32:07,622 - root - INFO - Memory Used: 12162.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:33:06,013 - root - INFO - Time taken for epoch 18 is 58.39062809944153 sec
2024-02-19 21:33:06,013 - root - INFO - Train loss: 0.21267618238925934. Valid loss: 0.19356952607631683
2024-02-19 21:33:06,047 - root - INFO - Memory Used: 12161.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:34:04,197 - root - INFO - Time taken for epoch 19 is 58.149407148361206 sec
2024-02-19 21:34:04,197 - root - INFO - Train loss: 0.210763618350029. Valid loss: 0.19327907264232635
2024-02-19 21:34:04,237 - root - INFO - Memory Used: 12162.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:35:02,320 - root - INFO - Time taken for epoch 20 is 58.08303165435791 sec
2024-02-19 21:35:02,321 - root - INFO - Train loss: 0.2218296229839325. Valid loss: 0.1950882226228714
2024-02-19 21:35:02,363 - root - INFO - Memory Used: 12168.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:36:00,335 - root - INFO - Time taken for epoch 21 is 57.97176480293274 sec
2024-02-19 21:36:00,335 - root - INFO - Train loss: 0.21751902997493744. Valid loss: 0.19366368651390076
2024-02-19 21:36:00,371 - root - INFO - Memory Used: 12168.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:36:58,337 - root - INFO - Time taken for epoch 22 is 57.96571636199951 sec
2024-02-19 21:36:58,338 - root - INFO - Train loss: 0.21502703428268433. Valid loss: 0.1928132176399231
2024-02-19 21:36:58,381 - root - INFO - Memory Used: 12168.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:37:56,796 - root - INFO - Time taken for epoch 23 is 58.4150128364563 sec
2024-02-19 21:37:56,796 - root - INFO - Train loss: 0.21450507640838623. Valid loss: 0.19170430302619934
2024-02-19 21:37:56,828 - root - INFO - Memory Used: 12248.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:38:55,456 - root - INFO - Time taken for epoch 24 is 58.62744641304016 sec
2024-02-19 21:38:55,456 - root - INFO - Train loss: 0.21105284988880157. Valid loss: 0.18977832794189453
2024-02-19 21:38:55,493 - root - INFO - Memory Used: 12172.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:39:53,707 - root - INFO - Time taken for epoch 25 is 58.21380591392517 sec
2024-02-19 21:39:53,707 - root - INFO - Train loss: 0.20906652510166168. Valid loss: 0.19015605747699738
2024-02-19 21:39:53,740 - root - INFO - Memory Used: 12167.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:40:52,269 - root - INFO - Time taken for epoch 26 is 58.528440713882446 sec
2024-02-19 21:40:52,269 - root - INFO - Train loss: 0.2086149901151657. Valid loss: 0.19145245850086212
2024-02-19 21:40:52,310 - root - INFO - Memory Used: 12168.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 21:41:50,732 - root - INFO - Time taken for epoch 27 is 58.4215042591095 sec
2024-02-19 21:41:50,733 - root - INFO - Train loss: 0.20688396692276. Valid loss: 0.19239431619644165
2024-02-19 21:41:50,766 - root - INFO - Memory Used: 12235.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:16:21,673 - root - INFO - --------------- Versions ---------------
2024-02-19 22:16:21,679 - root - INFO - git branch: b'* master'
2024-02-19 22:16:21,683 - root - INFO - git hash: b'1fa9d2703b6a8600c550e8de9d48a3f4ad44a6f3'
2024-02-19 22:16:21,683 - root - INFO - Torch: 2.1.2+cu118
2024-02-19 22:16:21,683 - root - INFO - ----------------------------------------
2024-02-19 22:16:21,683 - root - INFO - ------------------ Configuration ------------------
2024-02-19 22:16:21,683 - root - INFO - Configuration file: /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/AFNO.yaml
2024-02-19 22:16:21,683 - root - INFO - Configuration name: afno_backbone_ljkj
2024-02-19 22:16:21,684 - root - INFO - log_to_wandb False
2024-02-19 22:16:21,684 - root - INFO - lr 0.0005
2024-02-19 22:16:21,684 - root - INFO - batch_size 8
2024-02-19 22:16:21,684 - root - INFO - patch_size 4
2024-02-19 22:16:21,684 - root - INFO - depth 6
2024-02-19 22:16:21,684 - root - INFO - img_size [192, 288]
2024-02-19 22:16:21,684 - root - INFO - max_epochs 1500
2024-02-19 22:16:21,685 - root - INFO - scheduler CosineAnnealingLR
2024-02-19 22:16:21,685 - root - INFO - in_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:16:21,685 - root - INFO - out_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:16:21,685 - root - INFO - prediction_length 100
2024-02-19 22:16:21,685 - root - INFO - orography False
2024-02-19 22:16:21,685 - root - INFO - orography_path None
2024-02-19 22:16:21,685 - root - INFO - exp_dir ./results/tec_256
2024-02-19 22:16:21,685 - root - INFO - train_data_path ./train
2024-02-19 22:16:21,685 - root - INFO - valid_data_path ./test
2024-02-19 22:16:21,685 - root - INFO - inf_data_path ./out_of_sample
2024-02-19 22:16:21,686 - root - INFO - time_means_path ./time_means.npy
2024-02-19 22:16:21,686 - root - INFO - global_means_path ./global_means.npy
2024-02-19 22:16:21,686 - root - INFO - global_stds_path ./global_stds.npy
2024-02-19 22:16:21,686 - root - INFO - loss l2
2024-02-19 22:16:21,686 - root - INFO - num_data_workers 4
2024-02-19 22:16:21,686 - root - INFO - dt 1
2024-02-19 22:16:21,686 - root - INFO - n_history 0
2024-02-19 22:16:21,686 - root - INFO - prediction_type iterative
2024-02-19 22:16:21,686 - root - INFO - n_initial_conditions 5
2024-02-19 22:16:21,686 - root - INFO - ics_type default
2024-02-19 22:16:21,686 - root - INFO - save_raw_forecasts True
2024-02-19 22:16:21,686 - root - INFO - save_channel False
2024-02-19 22:16:21,686 - root - INFO - masked_acc False
2024-02-19 22:16:21,686 - root - INFO - maskpath None
2024-02-19 22:16:21,687 - root - INFO - perturb False
2024-02-19 22:16:21,687 - root - INFO - add_grid False
2024-02-19 22:16:21,687 - root - INFO - N_grid_channels 0
2024-02-19 22:16:21,687 - root - INFO - gridtype sinusoidal
2024-02-19 22:16:21,687 - root - INFO - roll False
2024-02-19 22:16:21,687 - root - INFO - num_blocks 8
2024-02-19 22:16:21,687 - root - INFO - nettype afno
2024-02-19 22:16:21,687 - root - INFO - width 56
2024-02-19 22:16:21,687 - root - INFO - modes 32
2024-02-19 22:16:21,687 - root - INFO - target default
2024-02-19 22:16:21,687 - root - INFO - normalization zscore
2024-02-19 22:16:21,687 - root - INFO - log_to_screen True
2024-02-19 22:16:21,687 - root - INFO - save_checkpoint True
2024-02-19 22:16:21,688 - root - INFO - enable_nhwc False
2024-02-19 22:16:21,688 - root - INFO - optimizer_type FusedAdam
2024-02-19 22:16:21,688 - root - INFO - crop_size_x None
2024-02-19 22:16:21,688 - root - INFO - crop_size_y None
2024-02-19 22:16:21,688 - root - INFO - two_step_training False
2024-02-19 22:16:21,688 - root - INFO - plot_animations False
2024-02-19 22:16:21,688 - root - INFO - add_noise False
2024-02-19 22:16:21,688 - root - INFO - noise_std 0
2024-02-19 22:16:21,688 - root - INFO - epsilon_factor 0
2024-02-19 22:16:21,688 - root - INFO - world_size 1
2024-02-19 22:16:21,688 - root - INFO - experiment_dir /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/results/tec_256/afno_backbone_ljkj/d6p_test1
2024-02-19 22:16:21,688 - root - INFO - checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:16:21,689 - root - INFO - best_checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/best_ckpt.tar
2024-02-19 22:16:21,689 - root - INFO - resuming True
2024-02-19 22:16:21,689 - root - INFO - local_rank 0
2024-02-19 22:16:21,689 - root - INFO - enable_amp True
2024-02-19 22:16:21,689 - root - INFO - name afno_backbone_ljkj_d6p_test1
2024-02-19 22:16:21,689 - root - INFO - group era5_precipafno_backbone_ljkj
2024-02-19 22:16:21,689 - root - INFO - project ERA5_precip
2024-02-19 22:16:21,689 - root - INFO - entity flowgan
2024-02-19 22:16:21,689 - root - INFO - ---------------------------------------------------
2024-02-19 22:16:21,705 - root - INFO - rank 0, begin data loader init
2024-02-19 22:16:26,950 - root - INFO - Getting file stats from ./train/2010.h5
2024-02-19 22:16:26,951 - root - INFO - Number of samples per year: 2360
2024-02-19 22:16:26,951 - root - INFO - Found data at path ./train. Number of examples: 2360. Image Shape: 192 x 288 x 13
2024-02-19 22:16:26,951 - root - INFO - Delta t: 6 hours
2024-02-19 22:16:26,951 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:16:26,952 - root - INFO - Getting file stats from ./test/2010.h5
2024-02-19 22:16:26,952 - root - INFO - Number of samples per year: 472
2024-02-19 22:16:26,952 - root - INFO - Found data at path ./test. Number of examples: 472. Image Shape: 192 x 288 x 13
2024-02-19 22:16:26,953 - root - INFO - Delta t: 6 hours
2024-02-19 22:16:26,953 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:16:26,953 - root - INFO - rank 0, data loader initialized
2024-02-19 22:20:04,336 - root - INFO - --------------- Versions ---------------
2024-02-19 22:20:04,342 - root - INFO - git branch: b'* master'
2024-02-19 22:20:04,345 - root - INFO - git hash: b'1fa9d2703b6a8600c550e8de9d48a3f4ad44a6f3'
2024-02-19 22:20:04,345 - root - INFO - Torch: 2.1.2+cu118
2024-02-19 22:20:04,345 - root - INFO - ----------------------------------------
2024-02-19 22:20:04,345 - root - INFO - ------------------ Configuration ------------------
2024-02-19 22:20:04,345 - root - INFO - Configuration file: /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/AFNO.yaml
2024-02-19 22:20:04,346 - root - INFO - Configuration name: afno_backbone_ljkj
2024-02-19 22:20:04,346 - root - INFO - log_to_wandb False
2024-02-19 22:20:04,346 - root - INFO - lr 0.0005
2024-02-19 22:20:04,346 - root - INFO - batch_size 8
2024-02-19 22:20:04,346 - root - INFO - patch_size 4
2024-02-19 22:20:04,346 - root - INFO - depth 6
2024-02-19 22:20:04,346 - root - INFO - img_size [192, 288]
2024-02-19 22:20:04,346 - root - INFO - max_epochs 1500
2024-02-19 22:20:04,346 - root - INFO - scheduler CosineAnnealingLR
2024-02-19 22:20:04,346 - root - INFO - in_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:20:04,346 - root - INFO - out_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:20:04,346 - root - INFO - prediction_length 100
2024-02-19 22:20:04,346 - root - INFO - orography False
2024-02-19 22:20:04,346 - root - INFO - orography_path None
2024-02-19 22:20:04,346 - root - INFO - exp_dir ./results/tec_256
2024-02-19 22:20:04,346 - root - INFO - train_data_path ./train
2024-02-19 22:20:04,346 - root - INFO - valid_data_path ./test
2024-02-19 22:20:04,346 - root - INFO - inf_data_path ./out_of_sample
2024-02-19 22:20:04,346 - root - INFO - time_means_path ./time_means.npy
2024-02-19 22:20:04,346 - root - INFO - global_means_path ./global_means.npy
2024-02-19 22:20:04,346 - root - INFO - global_stds_path ./global_stds.npy
2024-02-19 22:20:04,346 - root - INFO - loss l2
2024-02-19 22:20:04,346 - root - INFO - num_data_workers 4
2024-02-19 22:20:04,346 - root - INFO - dt 1
2024-02-19 22:20:04,346 - root - INFO - n_history 0
2024-02-19 22:20:04,346 - root - INFO - prediction_type iterative
2024-02-19 22:20:04,346 - root - INFO - n_initial_conditions 5
2024-02-19 22:20:04,346 - root - INFO - ics_type default
2024-02-19 22:20:04,346 - root - INFO - save_raw_forecasts True
2024-02-19 22:20:04,346 - root - INFO - save_channel False
2024-02-19 22:20:04,346 - root - INFO - masked_acc False
2024-02-19 22:20:04,346 - root - INFO - maskpath None
2024-02-19 22:20:04,346 - root - INFO - perturb False
2024-02-19 22:20:04,347 - root - INFO - add_grid False
2024-02-19 22:20:04,347 - root - INFO - N_grid_channels 0
2024-02-19 22:20:04,347 - root - INFO - gridtype sinusoidal
2024-02-19 22:20:04,347 - root - INFO - roll False
2024-02-19 22:20:04,347 - root - INFO - num_blocks 8
2024-02-19 22:20:04,347 - root - INFO - nettype afno
2024-02-19 22:20:04,347 - root - INFO - width 56
2024-02-19 22:20:04,347 - root - INFO - modes 32
2024-02-19 22:20:04,347 - root - INFO - target default
2024-02-19 22:20:04,347 - root - INFO - normalization zscore
2024-02-19 22:20:04,347 - root - INFO - log_to_screen True
2024-02-19 22:20:04,347 - root - INFO - save_checkpoint True
2024-02-19 22:20:04,347 - root - INFO - enable_nhwc False
2024-02-19 22:20:04,347 - root - INFO - optimizer_type FusedAdam
2024-02-19 22:20:04,347 - root - INFO - crop_size_x None
2024-02-19 22:20:04,347 - root - INFO - crop_size_y None
2024-02-19 22:20:04,347 - root - INFO - two_step_training False
2024-02-19 22:20:04,347 - root - INFO - plot_animations False
2024-02-19 22:20:04,347 - root - INFO - add_noise False
2024-02-19 22:20:04,347 - root - INFO - noise_std 0
2024-02-19 22:20:04,347 - root - INFO - epsilon_factor 0
2024-02-19 22:20:04,347 - root - INFO - world_size 1
2024-02-19 22:20:04,347 - root - INFO - experiment_dir /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/results/tec_256/afno_backbone_ljkj/d6p_test1
2024-02-19 22:20:04,347 - root - INFO - checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:20:04,347 - root - INFO - best_checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/best_ckpt.tar
2024-02-19 22:20:04,347 - root - INFO - resuming True
2024-02-19 22:20:04,347 - root - INFO - local_rank 0
2024-02-19 22:20:04,347 - root - INFO - enable_amp True
2024-02-19 22:20:04,347 - root - INFO - name afno_backbone_ljkj_d6p_test1
2024-02-19 22:20:04,347 - root - INFO - group era5_precipafno_backbone_ljkj
2024-02-19 22:20:04,347 - root - INFO - project ERA5_precip
2024-02-19 22:20:04,347 - root - INFO - entity flowgan
2024-02-19 22:20:04,347 - root - INFO - ---------------------------------------------------
2024-02-19 22:20:04,351 - root - INFO - rank 0, begin data loader init
2024-02-19 22:20:04,352 - root - INFO - Getting file stats from ./train/2010.h5
2024-02-19 22:20:04,352 - root - INFO - Number of samples per year: 2360
2024-02-19 22:20:04,352 - root - INFO - Found data at path ./train. Number of examples: 2360. Image Shape: 192 x 288 x 13
2024-02-19 22:20:04,352 - root - INFO - Delta t: 6 hours
2024-02-19 22:20:04,352 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:20:04,353 - root - INFO - Getting file stats from ./test/2010.h5
2024-02-19 22:20:04,353 - root - INFO - Number of samples per year: 472
2024-02-19 22:20:04,353 - root - INFO - Found data at path ./test. Number of examples: 472. Image Shape: 192 x 288 x 13
2024-02-19 22:20:04,353 - root - INFO - Delta t: 6 hours
2024-02-19 22:20:04,353 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:20:04,353 - root - INFO - rank 0, data loader initialized
2024-02-19 22:20:04,815 - root - INFO - Loading checkpoint ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:20:05,000 - root - INFO - Number of trainable model parameters: 33116928
2024-02-19 22:20:05,000 - root - INFO - Starting Training Loop...
2024-02-19 22:21:05,964 - root - INFO - Time taken for epoch 28 is 60.964223861694336 sec
2024-02-19 22:21:05,965 - root - INFO - Train loss: 0.20620867609977722. Valid loss: 0.19156751036643982
2024-02-19 22:21:06,029 - root - INFO - Memory Used: 7500.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:22:04,676 - root - INFO - Time taken for epoch 29 is 58.64586615562439 sec
2024-02-19 22:22:04,692 - root - INFO - Train loss: 0.2049102783203125. Valid loss: 0.1927620768547058
2024-02-19 22:22:04,726 - root - INFO - Memory Used: 11838.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:23:05,237 - root - INFO - Time taken for epoch 30 is 60.50999855995178 sec
2024-02-19 22:23:05,237 - root - INFO - Train loss: 0.20205412805080414. Valid loss: 0.19300618767738342
2024-02-19 22:23:05,276 - root - INFO - Memory Used: 11844.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:24:03,647 - root - INFO - Time taken for epoch 31 is 58.37112832069397 sec
2024-02-19 22:24:03,648 - root - INFO - Train loss: 0.20139522850513458. Valid loss: 0.19526340067386627
2024-02-19 22:24:03,737 - root - INFO - Memory Used: 11870.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:25:01,608 - root - INFO - Time taken for epoch 32 is 57.87055420875549 sec
2024-02-19 22:25:01,609 - root - INFO - Train loss: 0.20071007311344147. Valid loss: 0.19637790322303772
2024-02-19 22:25:01,651 - root - INFO - Memory Used: 11866.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:25:59,747 - root - INFO - Time taken for epoch 33 is 58.09547781944275 sec
2024-02-19 22:25:59,747 - root - INFO - Train loss: 0.20045530796051025. Valid loss: 0.19204077124595642
2024-02-19 22:25:59,828 - root - INFO - Memory Used: 11872.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:26:58,039 - root - INFO - Time taken for epoch 34 is 58.21076250076294 sec
2024-02-19 22:26:58,040 - root - INFO - Train loss: 0.19876112043857574. Valid loss: 0.19243396818637848
2024-02-19 22:26:58,080 - root - INFO - Memory Used: 11862.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:28:02,264 - root - INFO - --------------- Versions ---------------
2024-02-19 22:28:02,270 - root - INFO - git branch: b'* master'
2024-02-19 22:28:02,275 - root - INFO - git hash: b'1fa9d2703b6a8600c550e8de9d48a3f4ad44a6f3'
2024-02-19 22:28:02,275 - root - INFO - Torch: 2.1.2+cu118
2024-02-19 22:28:02,276 - root - INFO - ----------------------------------------
2024-02-19 22:28:02,276 - root - INFO - ------------------ Configuration ------------------
2024-02-19 22:28:02,276 - root - INFO - Configuration file: /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/AFNO.yaml
2024-02-19 22:28:02,276 - root - INFO - Configuration name: afno_backbone_ljkj
2024-02-19 22:28:02,276 - root - INFO - log_to_wandb False
2024-02-19 22:28:02,276 - root - INFO - lr 0.0005
2024-02-19 22:28:02,276 - root - INFO - batch_size 8
2024-02-19 22:28:02,276 - root - INFO - patch_size 4
2024-02-19 22:28:02,276 - root - INFO - depth 6
2024-02-19 22:28:02,276 - root - INFO - img_size [192, 288]
2024-02-19 22:28:02,276 - root - INFO - max_epochs 1500
2024-02-19 22:28:02,276 - root - INFO - scheduler CosineAnnealingLR
2024-02-19 22:28:02,276 - root - INFO - in_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:28:02,276 - root - INFO - out_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:28:02,277 - root - INFO - prediction_length 100
2024-02-19 22:28:02,277 - root - INFO - orography False
2024-02-19 22:28:02,277 - root - INFO - orography_path None
2024-02-19 22:28:02,277 - root - INFO - exp_dir ./results/tec_256
2024-02-19 22:28:02,277 - root - INFO - train_data_path ./train
2024-02-19 22:28:02,277 - root - INFO - valid_data_path ./test
2024-02-19 22:28:02,277 - root - INFO - inf_data_path ./out_of_sample
2024-02-19 22:28:02,277 - root - INFO - time_means_path ./time_means.npy
2024-02-19 22:28:02,277 - root - INFO - global_means_path ./global_means.npy
2024-02-19 22:28:02,277 - root - INFO - global_stds_path ./global_stds.npy
2024-02-19 22:28:02,277 - root - INFO - loss l2
2024-02-19 22:28:02,277 - root - INFO - num_data_workers 4
2024-02-19 22:28:02,277 - root - INFO - dt 1
2024-02-19 22:28:02,277 - root - INFO - n_history 0
2024-02-19 22:28:02,278 - root - INFO - prediction_type iterative
2024-02-19 22:28:02,278 - root - INFO - n_initial_conditions 5
2024-02-19 22:28:02,278 - root - INFO - ics_type default
2024-02-19 22:28:02,278 - root - INFO - save_raw_forecasts True
2024-02-19 22:28:02,278 - root - INFO - save_channel False
2024-02-19 22:28:02,278 - root - INFO - masked_acc False
2024-02-19 22:28:02,278 - root - INFO - maskpath None
2024-02-19 22:28:02,278 - root - INFO - perturb False
2024-02-19 22:28:02,278 - root - INFO - add_grid False
2024-02-19 22:28:02,278 - root - INFO - N_grid_channels 0
2024-02-19 22:28:02,278 - root - INFO - gridtype sinusoidal
2024-02-19 22:28:02,278 - root - INFO - roll False
2024-02-19 22:28:02,278 - root - INFO - num_blocks 8
2024-02-19 22:28:02,278 - root - INFO - nettype afno
2024-02-19 22:28:02,278 - root - INFO - width 56
2024-02-19 22:28:02,279 - root - INFO - modes 32
2024-02-19 22:28:02,279 - root - INFO - target default
2024-02-19 22:28:02,279 - root - INFO - normalization zscore
2024-02-19 22:28:02,279 - root - INFO - log_to_screen True
2024-02-19 22:28:02,279 - root - INFO - save_checkpoint True
2024-02-19 22:28:02,279 - root - INFO - enable_nhwc False
2024-02-19 22:28:02,279 - root - INFO - optimizer_type FusedAdam
2024-02-19 22:28:02,279 - root - INFO - crop_size_x None
2024-02-19 22:28:02,279 - root - INFO - crop_size_y None
2024-02-19 22:28:02,279 - root - INFO - two_step_training False
2024-02-19 22:28:02,279 - root - INFO - plot_animations False
2024-02-19 22:28:02,279 - root - INFO - add_noise False
2024-02-19 22:28:02,279 - root - INFO - noise_std 0
2024-02-19 22:28:02,279 - root - INFO - epsilon_factor 0
2024-02-19 22:28:02,279 - root - INFO - world_size 1
2024-02-19 22:28:02,280 - root - INFO - experiment_dir /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/results/tec_256/afno_backbone_ljkj/d6p_test1
2024-02-19 22:28:02,280 - root - INFO - checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:28:02,280 - root - INFO - best_checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/best_ckpt.tar
2024-02-19 22:28:02,280 - root - INFO - resuming True
2024-02-19 22:28:02,280 - root - INFO - local_rank 0
2024-02-19 22:28:02,280 - root - INFO - enable_amp True
2024-02-19 22:28:02,280 - root - INFO - name afno_backbone_ljkj_d6p_test1
2024-02-19 22:28:02,280 - root - INFO - group era5_precipafno_backbone_ljkj
2024-02-19 22:28:02,280 - root - INFO - project ERA5_precip
2024-02-19 22:28:02,280 - root - INFO - entity flowgan
2024-02-19 22:28:02,280 - root - INFO - ---------------------------------------------------
2024-02-19 22:28:02,295 - root - INFO - rank 0, begin data loader init
2024-02-19 22:28:02,296 - root - INFO - Getting file stats from ./train/2010.h5
2024-02-19 22:28:02,297 - root - INFO - Number of samples per year: 2360
2024-02-19 22:28:02,297 - root - INFO - Found data at path ./train. Number of examples: 2360. Image Shape: 192 x 288 x 13
2024-02-19 22:28:02,297 - root - INFO - Delta t: 6 hours
2024-02-19 22:28:02,297 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:28:02,298 - root - INFO - Getting file stats from ./test/2010.h5
2024-02-19 22:28:02,298 - root - INFO - Number of samples per year: 472
2024-02-19 22:28:02,298 - root - INFO - Found data at path ./test. Number of examples: 472. Image Shape: 192 x 288 x 13
2024-02-19 22:28:02,298 - root - INFO - Delta t: 6 hours
2024-02-19 22:28:02,298 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:28:02,298 - root - INFO - rank 0, data loader initialized
2024-02-19 22:41:08,880 - root - INFO - --------------- Versions ---------------
2024-02-19 22:41:08,887 - root - INFO - git branch: b'* master'
2024-02-19 22:41:08,892 - root - INFO - git hash: b'1fa9d2703b6a8600c550e8de9d48a3f4ad44a6f3'
2024-02-19 22:41:08,892 - root - INFO - Torch: 2.1.2+cu118
2024-02-19 22:41:08,892 - root - INFO - ----------------------------------------
2024-02-19 22:41:08,892 - root - INFO - ------------------ Configuration ------------------
2024-02-19 22:41:08,892 - root - INFO - Configuration file: /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/AFNO.yaml
2024-02-19 22:41:08,892 - root - INFO - Configuration name: afno_backbone_ljkj
2024-02-19 22:41:08,892 - root - INFO - log_to_wandb False
2024-02-19 22:41:08,893 - root - INFO - lr 0.0005
2024-02-19 22:41:08,893 - root - INFO - batch_size 8
2024-02-19 22:41:08,893 - root - INFO - patch_size 4
2024-02-19 22:41:08,893 - root - INFO - depth 6
2024-02-19 22:41:08,893 - root - INFO - img_size [192, 288]
2024-02-19 22:41:08,893 - root - INFO - max_epochs 1500
2024-02-19 22:41:08,893 - root - INFO - scheduler CosineAnnealingLR
2024-02-19 22:41:08,893 - root - INFO - in_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:41:08,893 - root - INFO - out_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:41:08,893 - root - INFO - prediction_length 100
2024-02-19 22:41:08,893 - root - INFO - orography False
2024-02-19 22:41:08,893 - root - INFO - orography_path None
2024-02-19 22:41:08,893 - root - INFO - exp_dir ./results/tec_256
2024-02-19 22:41:08,893 - root - INFO - train_data_path ./train
2024-02-19 22:41:08,894 - root - INFO - valid_data_path ./test
2024-02-19 22:41:08,894 - root - INFO - inf_data_path ./out_of_sample
2024-02-19 22:41:08,894 - root - INFO - time_means_path ./time_means.npy
2024-02-19 22:41:08,894 - root - INFO - global_means_path ./global_means.npy
2024-02-19 22:41:08,894 - root - INFO - global_stds_path ./global_stds.npy
2024-02-19 22:41:08,894 - root - INFO - loss l2
2024-02-19 22:41:08,894 - root - INFO - num_data_workers 4
2024-02-19 22:41:08,894 - root - INFO - dt 1
2024-02-19 22:41:08,894 - root - INFO - n_history 0
2024-02-19 22:41:08,894 - root - INFO - prediction_type iterative
2024-02-19 22:41:08,894 - root - INFO - n_initial_conditions 5
2024-02-19 22:41:08,894 - root - INFO - ics_type default
2024-02-19 22:41:08,894 - root - INFO - save_raw_forecasts True
2024-02-19 22:41:08,894 - root - INFO - save_channel False
2024-02-19 22:41:08,894 - root - INFO - masked_acc False
2024-02-19 22:41:08,895 - root - INFO - maskpath None
2024-02-19 22:41:08,895 - root - INFO - perturb False
2024-02-19 22:41:08,895 - root - INFO - add_grid False
2024-02-19 22:41:08,895 - root - INFO - N_grid_channels 0
2024-02-19 22:41:08,895 - root - INFO - gridtype sinusoidal
2024-02-19 22:41:08,895 - root - INFO - roll False
2024-02-19 22:41:08,895 - root - INFO - num_blocks 8
2024-02-19 22:41:08,895 - root - INFO - nettype afno
2024-02-19 22:41:08,895 - root - INFO - width 56
2024-02-19 22:41:08,895 - root - INFO - modes 32
2024-02-19 22:41:08,895 - root - INFO - target default
2024-02-19 22:41:08,895 - root - INFO - normalization zscore
2024-02-19 22:41:08,895 - root - INFO - log_to_screen True
2024-02-19 22:41:08,895 - root - INFO - save_checkpoint True
2024-02-19 22:41:08,895 - root - INFO - enable_nhwc False
2024-02-19 22:41:08,896 - root - INFO - optimizer_type FusedAdam
2024-02-19 22:41:08,896 - root - INFO - crop_size_x None
2024-02-19 22:41:08,896 - root - INFO - crop_size_y None
2024-02-19 22:41:08,896 - root - INFO - two_step_training False
2024-02-19 22:41:08,896 - root - INFO - plot_animations False
2024-02-19 22:41:08,896 - root - INFO - add_noise False
2024-02-19 22:41:08,896 - root - INFO - noise_std 0
2024-02-19 22:41:08,896 - root - INFO - epsilon_factor 0
2024-02-19 22:41:08,896 - root - INFO - world_size 1
2024-02-19 22:41:08,896 - root - INFO - experiment_dir /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/results/tec_256/afno_backbone_ljkj/d6p_test1
2024-02-19 22:41:08,896 - root - INFO - checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:41:08,896 - root - INFO - best_checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/best_ckpt.tar
2024-02-19 22:41:08,896 - root - INFO - resuming True
2024-02-19 22:41:08,896 - root - INFO - local_rank 0
2024-02-19 22:41:08,896 - root - INFO - enable_amp True
2024-02-19 22:41:08,897 - root - INFO - name afno_backbone_ljkj_d6p_test1
2024-02-19 22:41:08,897 - root - INFO - group era5_precipafno_backbone_ljkj
2024-02-19 22:41:08,897 - root - INFO - project ERA5_precip
2024-02-19 22:41:08,897 - root - INFO - entity flowgan
2024-02-19 22:41:08,897 - root - INFO - ---------------------------------------------------
2024-02-19 22:41:08,912 - root - INFO - rank 0, begin data loader init
2024-02-19 22:41:08,913 - root - INFO - Getting file stats from ./train/2010.h5
2024-02-19 22:41:08,913 - root - INFO - Number of samples per year: 2360
2024-02-19 22:41:08,913 - root - INFO - Found data at path ./train. Number of examples: 2360. Image Shape: 192 x 288 x 13
2024-02-19 22:41:08,913 - root - INFO - Delta t: 6 hours
2024-02-19 22:41:08,913 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:41:08,914 - root - INFO - Getting file stats from ./test/2010.h5
2024-02-19 22:41:08,915 - root - INFO - Number of samples per year: 472
2024-02-19 22:41:08,915 - root - INFO - Found data at path ./test. Number of examples: 472. Image Shape: 192 x 288 x 13
2024-02-19 22:41:08,915 - root - INFO - Delta t: 6 hours
2024-02-19 22:41:08,915 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:41:08,915 - root - INFO - rank 0, data loader initialized
2024-02-19 22:41:41,371 - root - INFO - --------------- Versions ---------------
2024-02-19 22:41:41,378 - root - INFO - git branch: b'* master'
2024-02-19 22:41:41,382 - root - INFO - git hash: b'1fa9d2703b6a8600c550e8de9d48a3f4ad44a6f3'
2024-02-19 22:41:41,382 - root - INFO - Torch: 2.1.2+cu118
2024-02-19 22:41:41,382 - root - INFO - ----------------------------------------
2024-02-19 22:41:41,382 - root - INFO - ------------------ Configuration ------------------
2024-02-19 22:41:41,383 - root - INFO - Configuration file: /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/AFNO.yaml
2024-02-19 22:41:41,383 - root - INFO - Configuration name: afno_backbone_ljkj
2024-02-19 22:41:41,383 - root - INFO - log_to_wandb False
2024-02-19 22:41:41,383 - root - INFO - lr 0.0005
2024-02-19 22:41:41,383 - root - INFO - batch_size 8
2024-02-19 22:41:41,383 - root - INFO - patch_size 4
2024-02-19 22:41:41,383 - root - INFO - depth 6
2024-02-19 22:41:41,383 - root - INFO - img_size [192, 288]
2024-02-19 22:41:41,383 - root - INFO - max_epochs 1500
2024-02-19 22:41:41,383 - root - INFO - scheduler CosineAnnealingLR
2024-02-19 22:41:41,383 - root - INFO - in_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:41:41,383 - root - INFO - out_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:41:41,383 - root - INFO - prediction_length 100
2024-02-19 22:41:41,383 - root - INFO - orography False
2024-02-19 22:41:41,384 - root - INFO - orography_path None
2024-02-19 22:41:41,384 - root - INFO - exp_dir ./results/tec_256
2024-02-19 22:41:41,384 - root - INFO - train_data_path ./train
2024-02-19 22:41:41,384 - root - INFO - valid_data_path ./test
2024-02-19 22:41:41,384 - root - INFO - inf_data_path ./out_of_sample
2024-02-19 22:41:41,384 - root - INFO - time_means_path ./time_means.npy
2024-02-19 22:41:41,384 - root - INFO - global_means_path ./global_means.npy
2024-02-19 22:41:41,384 - root - INFO - global_stds_path ./global_stds.npy
2024-02-19 22:41:41,384 - root - INFO - loss l2
2024-02-19 22:41:41,384 - root - INFO - retrain False
2024-02-19 22:41:41,384 - root - INFO - num_data_workers 4
2024-02-19 22:41:41,384 - root - INFO - dt 1
2024-02-19 22:41:41,384 - root - INFO - n_history 0
2024-02-19 22:41:41,385 - root - INFO - prediction_type iterative
2024-02-19 22:41:41,385 - root - INFO - n_initial_conditions 5
2024-02-19 22:41:41,385 - root - INFO - ics_type default
2024-02-19 22:41:41,385 - root - INFO - save_raw_forecasts True
2024-02-19 22:41:41,385 - root - INFO - save_channel False
2024-02-19 22:41:41,385 - root - INFO - masked_acc False
2024-02-19 22:41:41,385 - root - INFO - maskpath None
2024-02-19 22:41:41,385 - root - INFO - perturb False
2024-02-19 22:41:41,385 - root - INFO - add_grid False
2024-02-19 22:41:41,385 - root - INFO - N_grid_channels 0
2024-02-19 22:41:41,385 - root - INFO - gridtype sinusoidal
2024-02-19 22:41:41,385 - root - INFO - roll False
2024-02-19 22:41:41,385 - root - INFO - num_blocks 8
2024-02-19 22:41:41,385 - root - INFO - nettype afno
2024-02-19 22:41:41,386 - root - INFO - width 56
2024-02-19 22:41:41,386 - root - INFO - modes 32
2024-02-19 22:41:41,386 - root - INFO - target default
2024-02-19 22:41:41,386 - root - INFO - normalization zscore
2024-02-19 22:41:41,386 - root - INFO - log_to_screen True
2024-02-19 22:41:41,386 - root - INFO - save_checkpoint True
2024-02-19 22:41:41,386 - root - INFO - enable_nhwc False
2024-02-19 22:41:41,386 - root - INFO - optimizer_type FusedAdam
2024-02-19 22:41:41,386 - root - INFO - crop_size_x None
2024-02-19 22:41:41,386 - root - INFO - crop_size_y None
2024-02-19 22:41:41,386 - root - INFO - two_step_training False
2024-02-19 22:41:41,386 - root - INFO - plot_animations False
2024-02-19 22:41:41,386 - root - INFO - add_noise False
2024-02-19 22:41:41,386 - root - INFO - noise_std 0
2024-02-19 22:41:41,386 - root - INFO - epsilon_factor 0
2024-02-19 22:41:41,387 - root - INFO - world_size 1
2024-02-19 22:41:41,387 - root - INFO - experiment_dir /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/results/tec_256/afno_backbone_ljkj/d6p_test1
2024-02-19 22:41:41,387 - root - INFO - checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:41:41,387 - root - INFO - best_checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/best_ckpt.tar
2024-02-19 22:41:41,387 - root - INFO - resuming True
2024-02-19 22:41:41,387 - root - INFO - local_rank 0
2024-02-19 22:41:41,387 - root - INFO - enable_amp True
2024-02-19 22:41:41,387 - root - INFO - name afno_backbone_ljkj_d6p_test1
2024-02-19 22:41:41,387 - root - INFO - group era5_precipafno_backbone_ljkj
2024-02-19 22:41:41,387 - root - INFO - project ERA5_precip
2024-02-19 22:41:41,387 - root - INFO - entity flowgan
2024-02-19 22:41:41,387 - root - INFO - ---------------------------------------------------
2024-02-19 22:41:41,403 - root - INFO - rank 0, begin data loader init
2024-02-19 22:41:41,404 - root - INFO - Getting file stats from ./train/2010.h5
2024-02-19 22:41:41,404 - root - INFO - Number of samples per year: 2360
2024-02-19 22:41:41,404 - root - INFO - Found data at path ./train. Number of examples: 2360. Image Shape: 192 x 288 x 13
2024-02-19 22:41:41,404 - root - INFO - Delta t: 6 hours
2024-02-19 22:41:41,405 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:41:41,405 - root - INFO - Getting file stats from ./test/2010.h5
2024-02-19 22:41:41,405 - root - INFO - Number of samples per year: 472
2024-02-19 22:41:41,405 - root - INFO - Found data at path ./test. Number of examples: 472. Image Shape: 192 x 288 x 13
2024-02-19 22:41:41,406 - root - INFO - Delta t: 6 hours
2024-02-19 22:41:41,406 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:41:41,406 - root - INFO - rank 0, data loader initialized
2024-02-19 22:41:41,795 - root - INFO - Loading checkpoint ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:41:41,996 - root - INFO - Number of trainable model parameters: 33116928
2024-02-19 22:41:41,996 - root - INFO - Starting Training Loop...
2024-02-19 22:43:02,766 - root - INFO - --------------- Versions ---------------
2024-02-19 22:43:02,772 - root - INFO - git branch: b'* master'
2024-02-19 22:43:02,777 - root - INFO - git hash: b'1fa9d2703b6a8600c550e8de9d48a3f4ad44a6f3'
2024-02-19 22:43:02,777 - root - INFO - Torch: 2.1.2+cu118
2024-02-19 22:43:02,777 - root - INFO - ----------------------------------------
2024-02-19 22:43:02,777 - root - INFO - ------------------ Configuration ------------------
2024-02-19 22:43:02,777 - root - INFO - Configuration file: /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/AFNO.yaml
2024-02-19 22:43:02,777 - root - INFO - Configuration name: afno_backbone_ljkj
2024-02-19 22:43:02,777 - root - INFO - log_to_wandb False
2024-02-19 22:43:02,777 - root - INFO - lr 0.0005
2024-02-19 22:43:02,777 - root - INFO - batch_size 8
2024-02-19 22:43:02,777 - root - INFO - patch_size 4
2024-02-19 22:43:02,777 - root - INFO - depth 6
2024-02-19 22:43:02,777 - root - INFO - img_size [192, 288]
2024-02-19 22:43:02,778 - root - INFO - max_epochs 1500
2024-02-19 22:43:02,778 - root - INFO - scheduler CosineAnnealingLR
2024-02-19 22:43:02,778 - root - INFO - in_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:43:02,778 - root - INFO - out_channels [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2024-02-19 22:43:02,778 - root - INFO - prediction_length 100
2024-02-19 22:43:02,778 - root - INFO - orography False
2024-02-19 22:43:02,778 - root - INFO - orography_path None
2024-02-19 22:43:02,778 - root - INFO - exp_dir ./results/tec_256
2024-02-19 22:43:02,778 - root - INFO - train_data_path ./train
2024-02-19 22:43:02,778 - root - INFO - valid_data_path ./test
2024-02-19 22:43:02,778 - root - INFO - inf_data_path ./out_of_sample
2024-02-19 22:43:02,778 - root - INFO - time_means_path ./time_means.npy
2024-02-19 22:43:02,778 - root - INFO - global_means_path ./global_means.npy
2024-02-19 22:43:02,778 - root - INFO - global_stds_path ./global_stds.npy
2024-02-19 22:43:02,779 - root - INFO - loss l2
2024-02-19 22:43:02,779 - root - INFO - retrain False
2024-02-19 22:43:02,779 - root - INFO - num_data_workers 4
2024-02-19 22:43:02,779 - root - INFO - dt 1
2024-02-19 22:43:02,779 - root - INFO - n_history 0
2024-02-19 22:43:02,779 - root - INFO - prediction_type iterative
2024-02-19 22:43:02,779 - root - INFO - n_initial_conditions 5
2024-02-19 22:43:02,779 - root - INFO - ics_type default
2024-02-19 22:43:02,779 - root - INFO - save_raw_forecasts True
2024-02-19 22:43:02,779 - root - INFO - save_channel False
2024-02-19 22:43:02,779 - root - INFO - masked_acc False
2024-02-19 22:43:02,779 - root - INFO - maskpath None
2024-02-19 22:43:02,779 - root - INFO - perturb False
2024-02-19 22:43:02,779 - root - INFO - add_grid False
2024-02-19 22:43:02,779 - root - INFO - N_grid_channels 0
2024-02-19 22:43:02,780 - root - INFO - gridtype sinusoidal
2024-02-19 22:43:02,780 - root - INFO - roll False
2024-02-19 22:43:02,780 - root - INFO - num_blocks 8
2024-02-19 22:43:02,780 - root - INFO - nettype afno
2024-02-19 22:43:02,780 - root - INFO - width 56
2024-02-19 22:43:02,780 - root - INFO - modes 32
2024-02-19 22:43:02,780 - root - INFO - target default
2024-02-19 22:43:02,780 - root - INFO - normalization zscore
2024-02-19 22:43:02,780 - root - INFO - log_to_screen True
2024-02-19 22:43:02,780 - root - INFO - save_checkpoint True
2024-02-19 22:43:02,780 - root - INFO - enable_nhwc False
2024-02-19 22:43:02,780 - root - INFO - optimizer_type FusedAdam
2024-02-19 22:43:02,780 - root - INFO - crop_size_x None
2024-02-19 22:43:02,780 - root - INFO - crop_size_y None
2024-02-19 22:43:02,780 - root - INFO - two_step_training False
2024-02-19 22:43:02,781 - root - INFO - plot_animations False
2024-02-19 22:43:02,781 - root - INFO - add_noise False
2024-02-19 22:43:02,781 - root - INFO - noise_std 0
2024-02-19 22:43:02,781 - root - INFO - epsilon_factor 0
2024-02-19 22:43:02,781 - root - INFO - world_size 1
2024-02-19 22:43:02,781 - root - INFO - experiment_dir /home/cxt/work/fourcastnet_TEC/FourCastNetTEC/data_ljkj/results/tec_256/afno_backbone_ljkj/d6p_test1
2024-02-19 22:43:02,781 - root - INFO - checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:43:02,781 - root - INFO - best_checkpoint_path ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/best_ckpt.tar
2024-02-19 22:43:02,781 - root - INFO - resuming True
2024-02-19 22:43:02,781 - root - INFO - local_rank 0
2024-02-19 22:43:02,781 - root - INFO - enable_amp True
2024-02-19 22:43:02,781 - root - INFO - name afno_backbone_ljkj_d6p_test1
2024-02-19 22:43:02,781 - root - INFO - group era5_precipafno_backbone_ljkj
2024-02-19 22:43:02,781 - root - INFO - project ERA5_precip
2024-02-19 22:43:02,781 - root - INFO - entity flowgan
2024-02-19 22:43:02,782 - root - INFO - ---------------------------------------------------
2024-02-19 22:43:02,797 - root - INFO - rank 0, begin data loader init
2024-02-19 22:43:02,798 - root - INFO - Getting file stats from ./train/2010.h5
2024-02-19 22:43:02,798 - root - INFO - Number of samples per year: 2360
2024-02-19 22:43:02,798 - root - INFO - Found data at path ./train. Number of examples: 2360. Image Shape: 192 x 288 x 13
2024-02-19 22:43:02,798 - root - INFO - Delta t: 6 hours
2024-02-19 22:43:02,799 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:43:02,799 - root - INFO - Getting file stats from ./test/2010.h5
2024-02-19 22:43:02,799 - root - INFO - Number of samples per year: 472
2024-02-19 22:43:02,799 - root - INFO - Found data at path ./test. Number of examples: 472. Image Shape: 192 x 288 x 13
2024-02-19 22:43:02,799 - root - INFO - Delta t: 6 hours
2024-02-19 22:43:02,800 - root - INFO - Including 0 hours of past history in training at a frequency of 6 hours
2024-02-19 22:43:02,800 - root - INFO - rank 0, data loader initialized
2024-02-19 22:43:03,219 - root - INFO - Loading checkpoint ./results/tec_256/afno_backbone_ljkj/d6p_test1/training_checkpoints/ckpt.tar
2024-02-19 22:43:03,429 - root - INFO - Number of trainable model parameters: 33116928
2024-02-19 22:43:03,429 - root - INFO - Starting Training Loop...
2024-02-19 22:44:06,237 - root - INFO - Time taken for epoch 35 is 62.807788610458374 sec
2024-02-19 22:44:06,238 - root - INFO - Train loss: 0.19801630079746246. Valid loss: 0.1904757022857666
2024-02-19 22:44:06,308 - root - INFO - Memory Used: 7642.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
2024-02-19 22:45:27,484 - root - INFO - Time taken for epoch 36 is 81.1758120059967 sec
2024-02-19 22:45:27,485 - root - INFO - Train loss: 0.19783592224121094. Valid loss: 0.19136051833629608
2024-02-19 22:45:27,526 - root - INFO - Memory Used: 11985.0 MB, GPU.UUID: GPU-dead2cd9-3e2a-4455-2129-2efe429a641d
......@@ -84,6 +84,9 @@ import json
from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap as ruamelDict
def count_parameters(model):
    """Return the total number of elements held by ``model``'s parameters.

    Every object yielded by ``model.parameters()`` contributes its
    ``numel()`` to the total; no filtering is applied.
    """
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total
class Trainer():
def count_parameters(self):
    """Return the number of trainable elements in ``self.model``.

    Only parameters whose ``requires_grad`` is truthy contribute their
    ``numel()`` to the total; all others are ignored.
    """
    trainable = 0
    for param in self.model.parameters():
        if not param.requires_grad:
            continue
        trainable += param.numel()
    return trainable
......@@ -152,10 +155,14 @@ class Trainer():
if params.log_to_wandb:
wandb.watch(self.model)
if params.retrain:
model_parameters = self.model.patch_embed.parameters()
else:
model_parameters = self.model.parameters()
if params.optimizer_type == 'FusedAdam':
self.optimizer = optimizers.FusedAdam(self.model.parameters(), lr = params.lr)
self.optimizer = optimizers.FusedAdam(model_parameters, lr = params.lr)
else:
self.optimizer = torch.optim.Adam(self.model.parameters(), lr = params.lr)
self.optimizer = torch.optim.Adam(model_parameters, lr = params.lr)
if params.enable_amp == True:
self.gscaler = amp.GradScaler()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment