You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
While training on a single GPU, I am getting a "Pin memory thread exited unexpectedly" error. It happens after one round of train/val. I have set validation to run every 10 iterations so that the issue can be reproduced quickly. Please suggest possible reasons for it. The error log is given below:
2024-07-14 13:56:04,122 - mmedit - INFO - Checkpoints will be saved to /home/ubuntu/Desktop/RealBasicVSR/experiments/realbasicvsr_c64b20_1x30x8_lr5e-5_150k_reds by HardDiskBackend.
2024-07-14 13:56:04.586294: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable TF_ENABLE_ONEDNN_OPTS=0.
2024-07-14 13:56:04.636049: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-14 13:56:05.362593: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/nn/functional.py:3103: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, instead of relying on the computed output size. If you wish to restore the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details.
warnings.warn("The default behavior for interpolate/upsample with float scale_factor changed "
2024-07-14 13:56:32,710 - mmedit - INFO - Saving checkpoint at 10 iterations
1/1, 0.0 task/s, elapsed: 28s, ETA: 0s2024-07-14 13:57:02,335 - mmedit - INFO - Iter(val) [10] PSNR: 27.7156
2024-07-14 13:57:02,337 - mmedit - INFO - Iter(val) [10]
2024-07-14 13:57:23,808 - mmedit - INFO - Saving checkpoint at 20 iterations
[ ] 0/1, elapsed: 0s, ETA:Exception in thread Thread-3:
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
idx, data = r
ValueError: not enough values to unpack (expected 2, got 0)
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/.mim/tools/train.py", line 171, in <module>
main()
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/.mim/tools/train.py", line 160, in main
train_model(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/apis/train.py", line 107, in train_model
_non_dist_train(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/apis/train.py", line 363, in _non_dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_iters)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 144, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 70, in train
self.call_hook('after_train_iter')
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmcv/runner/base_runner.py", line 317, in call_hook
getattr(hook, fn_name)(self)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/core/evaluation/eval_hooks.py", line 42, in after_train_iter
results = single_gpu_test(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/apis/test.py", line 41, in single_gpu_test
for data in data_loader:
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 349, in __iter__
self._iterator._reset(self)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 852, in _reset
data = self._get_data()
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1029, in _get_data
raise RuntimeError('Pin memory thread exited unexpectedly')
RuntimeError: Pin memory thread exited unexpectedly
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/rbvsr/bin/mim", line 8, in <module>
sys.exit(cli())
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 1078, in main
rv = self.invoke(ctx)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mim/commands/train.py", line 100, in cli
is_success, msg = train(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mim/commands/train.py", line 261, in train
ret = subprocess.check_call(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/subprocess.py", line 364, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/home/ubuntu/anaconda3/envs/rbvsr/bin/python', '/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/.mim/tools/train.py', 'configs/realbasicvsr_c64b20_1x30x8_lr5e-5_150k_reds.py', '--launcher', 'none', '--gpus', '1']' returned non-zero exit status 1.
The text was updated successfully, but these errors were encountered:
While training on a single GPU, I am getting a "Pin memory thread exited unexpectedly" error. It happens after one round of train/val. I have set validation to run every 10 iterations so that the issue can be reproduced quickly. Please suggest possible reasons for it. The error log is given below:
2024-07-14 13:56:04,122 - mmedit - INFO - Checkpoints will be saved to /home/ubuntu/Desktop/RealBasicVSR/experiments/realbasicvsr_c64b20_1x30x8_lr5e-5_150k_reds by HardDiskBackend.
2024-07-14 13:56:04.586294: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable TF_ENABLE_ONEDNN_OPTS=0.
2024-07-14 13:56:04.636049: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-14 13:56:05.362593: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/nn/functional.py:3103: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, instead of relying on the computed output size. If you wish to restore the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details.
warnings.warn("The default behavior for interpolate/upsample with float scale_factor changed "
2024-07-14 13:56:32,710 - mmedit - INFO - Saving checkpoint at 10 iterations
1/1, 0.0 task/s, elapsed: 28s, ETA: 0s2024-07-14 13:57:02,335 - mmedit - INFO - Iter(val) [10] PSNR: 27.7156
2024-07-14 13:57:02,337 - mmedit - INFO - Iter(val) [10]
2024-07-14 13:57:23,808 - mmedit - INFO - Saving checkpoint at 20 iterations
[ ] 0/1, elapsed: 0s, ETA:Exception in thread Thread-3:
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
idx, data = r
ValueError: not enough values to unpack (expected 2, got 0)
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/.mim/tools/train.py", line 171, in <module>
main()
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/.mim/tools/train.py", line 160, in main
train_model(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/apis/train.py", line 107, in train_model
_non_dist_train(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/apis/train.py", line 363, in _non_dist_train
runner.run(data_loaders, cfg.workflow, cfg.total_iters)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 144, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 70, in train
self.call_hook('after_train_iter')
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmcv/runner/base_runner.py", line 317, in call_hook
getattr(hook, fn_name)(self)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/core/evaluation/eval_hooks.py", line 42, in after_train_iter
results = single_gpu_test(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/apis/test.py", line 41, in single_gpu_test
for data in data_loader:
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 349, in __iter__
self._iterator._reset(self)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 852, in _reset
data = self._get_data()
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1029, in _get_data
raise RuntimeError('Pin memory thread exited unexpectedly')
RuntimeError: Pin memory thread exited unexpectedly
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/rbvsr/bin/mim", line 8, in <module>
sys.exit(cli())
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 1078, in main
rv = self.invoke(ctx)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mim/commands/train.py", line 100, in cli
is_success, msg = train(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mim/commands/train.py", line 261, in train
ret = subprocess.check_call(
File "/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/subprocess.py", line 364, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/home/ubuntu/anaconda3/envs/rbvsr/bin/python', '/home/ubuntu/anaconda3/envs/rbvsr/lib/python3.8/site-packages/mmedit/.mim/tools/train.py', 'configs/realbasicvsr_c64b20_1x30x8_lr5e-5_150k_reds.py', '--launcher', 'none', '--gpus', '1']' returned non-zero exit status 1.
The text was updated successfully, but these errors were encountered: