---
# Training configuration for the `train_avclip` action.
#
# NOTE(review): this file arrived with its indentation collapsed onto a few
# very long lines. The nesting below is reconstructed from key order, from the
# repeated `target`/`params` pairs, and from keys that would otherwise collide
# in a single mapping (e.g. `name`, `momentum`). Confirm the hierarchy against
# the file named in `config:` at the bottom before relying on it.

action: train_avclip
debug: false

# Model definition: each component is a `target` (dotted import path) plus the
# `params` passed to it.
model:
  target: model.modules.feat_extractors.train_clip_src.open_clip.model.AVCLIP
  params:
    init_scale: 0.07
    clamp_scale_min: 0.001
    clamp_scale_max: 0.5
    n_embd: 768
    gather_for_loss: false
    # Audio feature extractor.
    afeat_extractor:
      is_trainable: true
      target: model.modules.feat_extractors.audio.ast.AST
      params:
        ckpt_path: MIT/ast-finetuned-audioset-10-10-0.4593
        extract_features: true
        # Same value as PadOrTruncate.max_spec_t in both transform sequences.
        max_spec_t: 66
        factorize_freq_time: true
        agg_freq_module: TransformerEncoderLayer
        agg_time_module: AveragePooling
        add_global_repr: false
        agg_segments_module: AveragePooling
        max_segments: 14
    # Visual feature extractor.
    vfeat_extractor:
      is_trainable: true
      target: model.modules.feat_extractors.visual.motionformer.MotionFormer
      params:
        ckpt_path: ./model/modules/feat_extractors/visual/motionformer_src/ssv2_divided_224_16x4.pyth
        extract_features: true
        factorize_space_time: true
        agg_space_module: TransformerEncoderLayer
        agg_time_module: AveragePooling
        add_global_repr: false
        agg_segments_module: AveragePooling
        max_segments: 14
    # Audio / visual projection heads; in_features equals out_features here.
    aproj:
      target: model.modules.bridges.DoNothingBridge
      params:
        in_features: 768
        out_features: 768
    vproj:
      target: model.modules.bridges.DoNothingBridge
      params:
        in_features: 768
        out_features: 768

training:
  resume: null
  learning_rate: 0.0001
  lr_cooldown_end: 0.0
  lr_cooldown_power: 1.0
  base_batch_size: 2
  queue_size: 0
  for_loop_segment_fwd: false
  grad_checkpointing: false
  # Distinct from `optimizer.momentum` below (different nesting level).
  momentum: 0.995
  num_workers: 7
  num_epochs: 100
  patience: 20
  epochs_cooldown: null
  val_frequency: 1
  compile: false
  to_max_metric: true
  metric_name: precision
  early_stop_phase: valid
  precision: amp
  alpha: 0.0
  seed: 1337
  run_test_only: false
  dist_backend: nccl
  # Quoted defensively: the value contains a colon.
  dist_url: 'env://'
  ddp_static_graph: false
  no_set_device_rank: false
  remote_sync: null
  remote_sync_frequency: 300
  remote_sync_protocol: s3
  distill_model: null
  distill_pretrained: null
  force_image_size: null
  lock_rgb: false
  lock_audio: false
  lock_rgb_unlocked_groups: 0
  lock_audio_unlocked_layers: 0
  lock_rgb_freeze_bn_stats: false
  lock_audio_freeze_layer_norm: false
  trace: false
  use_bn_sync: false
  max_clip_norm: 1.0
  # The next three repeat the values of the same-named keys in model.params.
  init_scale: 0.07
  clamp_scale_min: 0.001
  clamp_scale_max: 0.5
  run_shifted_win_val: true
  run_shifted_win_val_winsize_valid: 8
  run_shifted_win_val_winsize_train: 8
  segment_loss_weight: 1.0
  global_loss_weight: 1.0
  skip_scheduler: false
  lr_scheduler:
    name: cosine
    warmup: 1000
  optimizer:
    name: adamw
    betas: [0.9, 0.999]
    momentum: 0.9
    weight_decay: 0.0

data:
  dataset_type: sparsesync
  vids_path: /flash/project_462000293/vladimir/vggsound/h264_video_25fps_256side_16000hz_aac/
  size_before_crop: 256
  # The scalar settings below carry the same values as the corresponding
  # params in the transform sequences further down in this file.
  input_size: 224
  segment_size_vframes: 16
  is_spatial_crop_random: true
  is_temporal_crop_random: true
  sometimes_upscale_p: 0.2
  p_horizontal_flip: 0.5
  p_audio_aug: 0.2
  p_color_jitter: 0.2
  p_gray_scale: 0.2
  n_segments_train: 14
  n_segments_valid: 14
  audio_jitter_sec: 0.05
  step_size_seg: 1.0
  dataset:
    target: dataset.vggsound.VGGSound
    params:
      load_fixed_offsets_on: []
      vis_load_backend: read_video
      size_ratio: null

# Transforms for the training split, listed in the order given.
# NOTE(review): placed at top level (not under `data`) by inference — confirm.
transform_sequence_train:
  - target: dataset.transforms.EqualifyFromRight
    params:
      clip_max_len_sec: 10
  - target: dataset.transforms.RGBSpatialCropSometimesUpscale
    params:
      sometimes_p: 0.2
      smaller_input_size: 192
      target_input_size: 224
      is_random: true
  - target: dataset.transforms.GenerateMultipleSegments
    params:
      segment_size_vframes: 16
      n_segments: 14
      is_start_random: true
      audio_jitter_sec: 0.05
      step_size_seg: 1.0
  - target: dataset.transforms.RandomApplyColorDistortion
    params:
      p_color_jitter: 0.2
      s: 1.0
      p_gray_scale: 0.2
  - target: dataset.transforms.RandomHorizontalFlip
    params:
      p: 0.5
  - target: dataset.transforms.RGBToHalfToZeroOne
  - target: dataset.transforms.RGBNormalize
    params:
      mean: [0.5, 0.5, 0.5]
      std: [0.5, 0.5, 0.5]
  - target: dataset.transforms.AudioRandomReverb
    params:
      p: 0.2
  - target: dataset.transforms.AudioRandomVolume
    params:
      p: 0.2
      gain: 2.0
      gain_type: amplitude
  - target: dataset.transforms.AudioRandomPitchShift
    params:
      p: 0.2
      shift: 1000
  - target: dataset.transforms.AudioRandomLowpassFilter
    params:
      p: 0.2
      cutoff_freq: 100
  - target: dataset.transforms.AudioRandomGaussNoise
    params:
      p: 0.2
      amplitude: 0.01
  - target: dataset.transforms.AudioMelSpectrogram
    params:
      sample_rate: 16000
      win_length: 400
      hop_length: 160
      n_fft: 1024
      n_mels: 128
  - target: dataset.transforms.AudioLog
  - target: dataset.transforms.PadOrTruncate
    params:
      max_spec_t: 66
  - target: dataset.transforms.AudioNormalizeAST
    params:
      mean: -4.2677393
      std: 4.5689974
  - target: dataset.transforms.PermuteStreams
    params:
      # Quoted defensively: the patterns contain `->` and spaces.
      einops_order_audio: 'S F T -> S T F'
      einops_order_rgb: 'S T C H W -> S C T H W'

# Transforms for the test split: no random augmentations are listed, and the
# crop / segment start are non-random (`is_random` / `is_start_random: false`).
transform_sequence_test:
  - target: dataset.transforms.EqualifyFromRight
  - target: dataset.transforms.RGBSpatialCrop
    params:
      input_size: 224
      is_random: false
  - target: dataset.transforms.GenerateMultipleSegments
    params:
      segment_size_vframes: 16
      n_segments: 14
      is_start_random: false
      step_size_seg: 1.0
  - target: dataset.transforms.RGBToFloatToZeroOne
  - target: dataset.transforms.RGBNormalize
    params:
      mean: [0.5, 0.5, 0.5]
      std: [0.5, 0.5, 0.5]
  - target: dataset.transforms.AudioMelSpectrogram
    params:
      sample_rate: 16000
      win_length: 400
      hop_length: 160
      n_fft: 1024
      n_mels: 128
  - target: dataset.transforms.AudioLog
  - target: dataset.transforms.PadOrTruncate
    params:
      max_spec_t: 66
  - target: dataset.transforms.AudioNormalizeAST
    params:
      mean: -4.2677393
      std: 4.5689974
  - target: dataset.transforms.PermuteStreams
    params:
      einops_order_audio: 'S F T -> S T F'
      einops_order_rgb: 'S T C H W -> S C T H W'

logging:
  logdir: /scratch/project_462000293/vladimir/logs/sync/avclip_models
  log_code_state: true
  log_frequency: 100
  log_local: false
  delete_previous_checkpoint: false
  save_most_recent: true
  save_frequency: 0
  # Glob entries beginning with `*` must stay quoted; a leading bare `*`
  # would be read as a YAML alias.
  patterns_to_ignore:
    - logs
    - .git
    - __pycache__
    - data
    - '*.pt'
    - '*.pyth'
    - sbatch_logs
    - '*.mp4'
    - '*.wav'
    - '*.jpg'
    - '*.gif'
    - misc*
  use_tboard: false
  use_wandb: true
  wandb_notes: null

# Run-specific values. The paths below embed `start_time` / `name` under
# `logging.logdir`.
# NOTE(review): these look like they were filled in at launch time and are
# placed at top level by inference from key order — confirm.
# Quoted so the date-like run id is always read as a string.
start_time: '23-12-22T16-10-50'
config: ./configs/segment_avclip.yaml
distributed: true
world_size: 32
rank: 0
local_rank: 0
device: 'cuda:0'
name: '23-12-22T16-10-50'
log_path: /scratch/project_462000293/vladimir/logs/sync/avclip_models/23-12-22T16-10-50/out.log
log_level: 20
checkpoint_path: /scratch/project_462000293/vladimir/logs/sync/avclip_models/23-12-22T16-10-50/checkpoints
tensorboard_path: /scratch/project_462000293/vladimir/logs/sync/avclip_models/23-12-22T16-10-50
distill: false