-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_topsStack_files.sh
executable file
·216 lines (194 loc) · 10.4 KB
/
clean_topsStack_files.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/bin/bash
# Remove large intermediate files of a topsStack run to stay within disk quotas.
#
# Need to run script from within run_files directory for a topsStack run
# Delete everything with this command (add --burst_igram to also remove the burst interferograms)
# ./clean_topsStack_files.sh --coarse_igram --burst_slc --ion_burst_slc --ion_burst_igram --ion_split_igram --geom_reference --coreg_offset --coreg_overlap --esd
# Otherwise you can add specific deletions to the end of particular sbatch files so that we don't end up hitting disk quotas during processing
# Using 'find' and 'xargs' to deal with very large lists of files (ls/du/rm will fail when over ~300k files)
# TODO pass the stage at which this script is called, and write that to the log file

# If delete_files_bool is false we just write the files that would be deleted to the logfile, along with their sizes
# If true, we actually erase files
##############################
delete_files_bool="true"
##############################

step_start=$(date +%s) # Start time in seconds since the epoch, used for the elapsed-time report
current_dir=$(pwd)

# Assuming that we'll be running this from 'run_files'.
# Every search/deletion pattern is relative to the directory we land in, so
# stop here rather than search from the wrong place if the cd fails.
cd .. || {
  echo "Could not change to the parent of $current_dir, exiting without deleting" >&2
  exit 1
}

printf -v date '%(%Y-%m-%d)T' -1 # Store today's date in $date for file naming
# Explanation of the printf command: https://stackoverflow.com/questions/30098992/what-does-printf-v-do
# logfile=$current_dir/"delete_files_${date}.log" # Dated variant, currently disabled
logfile=$current_dir/"delete_files.log" # TODO this will write to the same file each time
# Exit if we don't have any command line arguments: with nothing selected
# there is nothing to delete, so record that and stop with a failure status.
# The log path is quoted everywhere so a run directory containing spaces
# does not turn the redirection into an "ambiguous redirect" error.
if (( $# == 0 )); then
  echo "No arguments supplied, exiting without deleting" >> "$logfile"
  echo "No arguments supplied, exiting without deleting" # write to stdout for testing
  exit 1
fi

# Start a new section in the log for this invocation
run_date=$(date)
{
  echo "###########################################"
  echo "Calling clean_topsStack_files.sh at: $run_date"
} >> "$logfile"

# State clearly (stdout and log) whether this run really deletes files
if [[ "$delete_files_bool" == "true" ]]; then
  echo 'Deleting files - NOT A DRY RUN' # Echo to stdout for testing
  echo 'Deleting files - NOT A DRY RUN' >> "$logfile"
else
  echo 'Dry run, not deleting files'
  echo 'Dry run, not deleting files' >> "$logfile"
fi
########## Regular expressions selecting the files to delete
# Each pattern is handed to 'find ./ -regextype posix-extended -regex', so it
# has to match the whole path as printed by find (starting with './').
# Note - a literal '.' must be escaped as '\.'
#      - '.*' matches any number of characters
# NOTE(review): '.' also matches '/', so a fragment such as '20.*/IW.*' can
# span additional directory levels - confirm no pattern matches more than intended.
# Every pattern defined here needs a matching entry in the 'case' statement
# that reads the command line options.

## Candidates for removal once generate_burst_igram has run
# Coarse interferograms
coarse_path='\./coarse_interferograms/20.*_20.*/overlap/IW.*/int_.*\.int'

# Regular burst SLCs (not the ones needed for the ionosphere processing).
# They feed the merged SLCs through .vrt files and are needed to calculate the
# coherence, so only delete them after run_15_filter_coherence.
# Only the plainly numbered bursts are matched, not e.g. burst_03_upper.slc
# NB - the directory is spelled 'secondarys', not 'secondaries'
burst_slc_path='\./coreg_secondarys/20.*/IW.*/burst_[0-9][0-9]\.slc'

# Burst interferograms - delete after run_14_merge_burst_igram
burst_igram_path='\./interferograms/20.*_20.*/IW.*/fine_.*\.int'

## Candidates for removal once computeIon has run
# Upper and lower sub-band burst SLCs
# Probably safe after run_18_generateIgram_ion
ion_burst_slc_path='\./coreg_secondarys/20.*/IW.*/burst_[0-9][0-9]_(lower|upper)\.slc'

# Burst interferograms inside 'ion'
# Probably safe after run_19_merge_BurstsIon
ion_burst_igram_path='\./ion/20.*_20.*/(lower|upper)/interferograms/IW.*/fine_.*\.int'

# Merged split-band interferograms - delete after run_22_computeIon
# 'upper' does not seem to contain .cor files; the earlier glob-based version produced
# errors like: ls: cannot access ./ion/20*_20*/upper/merged/*.cor: No such file or directory
ion_split_igram_path='\./ion/20.*_20.*/(lower|upper)/merged/.*\.(int|cor|unw|conncomp)'

# Geometry reference burst files
# Remove after run_12_merge_reference_secondary (to be confirmed)
geom_reference_path='\./geom_reference/IW.*/.*\.rdr'

# Overlap files of the coregistered secondarys
# Experimental: remove after run_08_timeseries_misreg
coreg_overlap_path='\./coreg_secondarys/20.*/overlap/IW.*/.*\.(slc|off)'

# Azimuth and range offset files of the coregistered secondarys
# Believed to be generated by run_17_subband_and_resamp
# Remove after run_22_computeIon (possibly earlier)
coreg_offset_path='\./coreg_secondarys/20.*/IW.*/.*\.off'

## ESD files
# Experimental: erase after run_08_timeseries_misreg
# Earlier glob-based selections, kept for reference:
#   ./ESD/20*_20*/IW*/freq*bin
#   ./ESD/20*_20*/IW*/overlap*int
esd_path='\./ESD/20.*_20.*/IW.*/.*\.(int|bin|cor|off)'

# TODO if one of the patterns above is wrong, we can end up calling du on the whole directory rather than just the files we're trying to delete
# That leads to bad results for the total size of deleted files
# idiomatic parameter and option handling in sh
# Taken from here: https://superuser.com/questions/186272/check-if-any-of-the-parameters-to-a-bash-script-match-a-string/186279

#######################################
# Translate command line options into deletion regexes.
# For every recognised option the corresponding *_path regex is appended to
# del_paths and a line naming the selection is appended to the log.
# Globals:   del_paths (appended to), logfile (appended to),
#            the *_path regex variables (read)
# Arguments: the options given to the script, e.g. --esd --burst_slc
# Outputs:   a message on stdout (and in the log) for each unknown option
#######################################
select_del_paths() {
  local opt
  for opt in "$@"; do
    case "$opt" in
      --coarse_igram)
        del_paths+=("$coarse_path")
        echo "- coarse interferograms" >> "$logfile"
        ;;
      --burst_slc)
        del_paths+=("$burst_slc_path")
        echo "- burst slcs" >> "$logfile"
        ;;
      --burst_igram)
        del_paths+=("$burst_igram_path")
        echo "- burst interferograms" >> "$logfile"
        ;;
      --ion_burst_slc)
        del_paths+=("$ion_burst_slc_path")
        echo "- ionosphere burst slcs" >> "$logfile"
        ;;
      --ion_burst_igram)
        del_paths+=("$ion_burst_igram_path")
        echo "- ionosphere burst interferograms" >> "$logfile"
        ;;
      --ion_split_igram)
        del_paths+=("$ion_split_igram_path")
        echo "- ionosphere split-band interferograms" >> "$logfile"
        ;;
      --geom_reference)
        del_paths+=("$geom_reference_path")
        echo "- geometry reference burst files" >> "$logfile"
        ;;
      --coreg_offset)
        del_paths+=("$coreg_offset_path")
        echo "- azimuth and range offset files" >> "$logfile"
        ;;
      --coreg_overlap)
        del_paths+=("$coreg_overlap_path")
        echo "- overlap slcs and offset files" >> "$logfile"
        ;;
      --esd)
        del_paths+=("$esd_path")
        echo "- ESD files" >> "$logfile"
        ;;
      --all)
        # NOTE(review): coreg_overlap_path is the one pattern not included
        # here - confirm whether that is intentional before adding it.
        del_paths+=(
          "$coarse_path" "$burst_slc_path" "$burst_igram_path"
          "$ion_burst_slc_path" "$ion_burst_igram_path" "$ion_split_igram_path"
          "$geom_reference_path" "$coreg_offset_path" "$esd_path"
        )
        echo "- All deletion options activated" >> "$logfile"
        ;;
      *)
        # Keep running even if one argument is wrong, but leave a trace in
        # the log as well as on stdout so the mistake is visible afterwards.
        # TODO might want to terminate the whole run if we make a mistake here
        echo "argument $opt not understood"
        echo "argument $opt not understood" >> "$logfile"
        ;;
    esac
  done
}

del_paths=()
echo "Deleting:" >> "$logfile"
select_del_paths "$@"
echo "###########################################" >> "$logfile"
#######################################
# Print every path below the current directory whose complete path matches a
# POSIX extended regular expression. Output is NUL-delimited so that names
# containing whitespace survive the hand-over to xargs.
# Arguments: $1 - regex; always passed quoted so the shell never glob-expands
#                 or word-splits it
# Outputs:   NUL-delimited paths on stdout
#######################################
list_matching_files() {
  find ./ -regextype posix-extended -regex "$1" -print0
}

#######################################
# Log, measure and (unless this is a dry run) delete the files matching one regex.
# Globals:   logfile (appended to), delete_files_bool (read),
#            size_arr / size_bytes_arr (one entry appended per call)
# Arguments: $1 - regex selecting the files
# Outputs:   file list, timestamps and the size in TB, all written to the log
#######################################
clean_matching_files() {
  local pattern=$1
  local size_bytes size

  {
    echo "###########################################"
    echo "Deleting files matching regex: $pattern"
    echo "###########################################"
    echo "File list:"
    list_matching_files "$pattern" | tr '\0' '\n'
    # Sizing can be time consuming for large numbers of files
    echo "Calculating sizes of deleted files at:  $(date)"
  } >> "$logfile"

  # 'find' and 'xargs' cope with very large numbers of files. xargs splits the
  # list into batches, so the per-file sizes are added up by awk rather than
  # relying on a per-batch total. -r skips stat entirely when nothing matched
  # (e.g. the files were already deleted), and "%.0f" keeps large totals out
  # of exponent notation, so the result is always a plain integer (0 if empty).
  size_bytes=$(list_matching_files "$pattern" \
    | xargs -0 -r stat -c '%s' -- \
    | awk '{ total += $1 } END { printf "%.0f\n", total }')
  echo "- Done calculating sizes of deleted files at:  $(date)" >> "$logfile"

  # Scale bytes into TB (1024^4 bytes), rounded to three decimals
  # TODO could automatically scale into TB vs GB depending on the size
  size=$(awk -v bytes="$size_bytes" 'BEGIN { printf "%.3f", bytes / 1099511627776 }')

  # Do deletions
  if [[ "$delete_files_bool" == "true" ]]; then
    echo "Deleting at:  $(date)" >> "$logfile"
    # TODO deleting large numbers of files like this is slow - explore other options
    list_matching_files "$pattern" | xargs -0 -r rm -f --
    echo "- Finished deleting at:  $(date)" >> "$logfile"
  fi

  {
    echo "###########################################"
    if [[ "$delete_files_bool" == "true" ]]; then
      printf 'Size deleted: %s TB \n' "$size"
    else
      printf 'Size that would be deleted: %s TB \n' "$size"
    fi
    echo "###########################################"
  } >> "$logfile"

  size_arr+=("$size")             # Size in TB, as text
  size_bytes_arr+=("$size_bytes") # Size in bytes, summed up after the loop
}

# Loop over del_paths and delete the files for each path
size_arr=()
size_bytes_arr=()
for path in "${del_paths[@]}"; do
  clean_matching_files "$path"
done
#######################################
# Format a duration as "<hours>hrs <minutes>min <seconds>sec".
# Arguments: $1 - duration in whole seconds
# Outputs:   the formatted duration on stdout
#######################################
format_elapsed() {
  local secs=$1
  printf '%dhrs %dmin %dsec\n' \
    "$(( secs / 3600 ))" "$(( (secs / 60) % 60 ))" "$(( secs % 60 ))"
}

echo "###########################################" >> "$logfile"

# Get size of remaining files: human-readable total of the current directory
echo "Calculating sizes of all files at:  $(date)" >> "$logfile"
size_remain=$(du -sh . | cut -f 1) # Total amount of data we have left
echo "- Done calculating sizes of all files at:  $(date)" >> "$logfile"

# Get total size of deleted files in bytes by adding up the per-pattern sizes
sum=0
for bytes in "${size_bytes_arr[@]}"; do
  sum=$(( sum + bytes ))
done
# Scale the total into TB (1024^4 bytes), rounded to three decimals
total_tb=$(awk -v bytes="$sum" 'BEGIN { printf "%.3f", bytes / 1099511627776 }')

# Calculate time for all erasing (make sure this is at the end of the code)
# If deletion is slow, we might want to start it as a separate job that doesn't hold up subsequent jobs
step_end=$(date +%s)
step_time=$(( step_end - step_start ))
Elapsed=$(format_elapsed "$step_time")

# Output stats
{
  printf "##################################\n"
  echo "Total time: $Elapsed"
  if [[ "$delete_files_bool" == "true" ]]; then
    printf 'Total deleted: %s TB \n' "$total_tb"
  else
    printf 'Total that would be deleted: %s TB \n' "$total_tb"
  fi
  echo "Size remaining: $size_remain" # Reported once, after all deletions
  printf "##################################\n"
  # Add a reminder if we're doing a dry run
  if [[ "$delete_files_bool" == "false" ]]; then
    echo 'Reminder: dry run, not deleting files'
  fi
} >> "$logfile"

# Back to run_files
cd "$current_dir"
echo 'Finished'
{
  echo 'Finished'
  echo "###########################################"
  echo "###########################################"
} >> "$logfile"