#!/usr/bin/env bash
# This script expects a sourced OpenStack project (openrc) file and a
# configured rclone remote for the Swift object store in $RCLONE_DESTINATION.
# The maximum number of simultaneous uploads can be set in the $MAX_PROCESSES
# environment variable; the number of available CPU threads is a reasonable value.
# $1 is the location to be copied, $2 is the destination container.
# TODO: Check if the 500k objects-per-bucket limit is exceeded
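#
# Example invocation (illustrative; the openrc file name, directory and
# bucket name below are placeholders):
#
#   source project_2001234-openrc.sh   # sets OS_PASSWORD and other OS_* variables
#   allas-dir-to-bucket my_data 2001234-backup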
# Read static variables
inst_root="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source "$inst_root/a_env_conf"
source "$inst_root/allas-lib"
RCLONE_DESTINATION=${storage_server}
MAX_PROCESSES=4
input=""
bucket_name=""
user=$(whoami)
print_help=0
check_md5=0
force=0
sdc=0
tmp_root="$(pwd)"
if [[ $# -lt 1 ]]; then
print_help=1
fi
#Process command line
while [[ $# -ge 1 ]]
do
case "$1" in
'--bucket' | '-b' )
bucket_name="$2"
#Remove the trailing / if it exists
bucket_name=$(remove_slash_from_ends $bucket_name)
shift
shift
;;
'--directory' | '-d' )
input="$2"
#Remove the trailing / if it exists
input=$(remove_slash_from_ends $input)
shift
shift
;;
'--stream' | '-s' )
export MAX_PROCESSES=$2
shift
shift
;;
'--user' | '-u' )
user="$2"
shift
shift
;;
'-h' | '--help' )
print_help=1
shift
;;
'--md5' | '-m' )
check_md5=1
shift
;;
'--force' | '-f' )
force=1
shift
;;
'--sdc' | '-S' )
if [[ $(which sd-lock-util 2> /dev/null | wc -l ) -ne 1 ]];then
echo ""
echo "sd-lock-util is not available!"
echo "Please install sd-lock-util if you want to use SD Connect based encryption."
echo " https://github.com/CSCfi/sd-lock-util/"
exit 1
fi
sdc=1
shift
;;
*)
if [[ $input == "" ]]; then
input="$1"
if [[ ! -e $input ]] ; then
echo "File or directory $input does not exist!"
exit 1
fi
else
if [[ $bucket_name == "" ]]; then
bucket_name="$1"
else
echo "Unknown option: $1"
exit 1
fi
fi
shift # No more switches
;;
esac
done
if [[ $print_help -eq 1 ]]; then
cat <<EOF
DESCRIPTION
This tool uploads the content of a directory or a file to a bucket in
Allas. The upload is done with rclone.

The upload synchronizes the content of the source directory on your local
computer with the target bucket in Allas. This means that in addition to
copying data from the local directory to the bucket in Allas, objects in
Allas that don't match files in the local directory are removed. Because
of this, you should mainly use this tool to upload data to a new, empty
bucket in Allas. If the target bucket does not exist, it will be created
automatically.

In Allas, files larger than 5 GiB are stored as 5 GiB segments. This tool
utilizes this segmentation to speed up the upload by transferring several
segments of a large file simultaneously. Smaller files are uploaded with
a normal rclone copy command.

Data can be stored in SD Connect compatible encrypted format if the
sd-lock-util command is available and an SD Connect compatible Allas
connection has been set up.
USAGE
The basic syntax of the command is:

   allas-dir-to-bucket source-directory target-bucket

You can also define the source directory and target bucket with
command line options.
The files will be uploaded to the defined bucket, and the object names
will include the source directory name. For example, if we have a
directory "data1" containing the files sample_1.dat and sample_2.dat,
then the command:

   allas-dir-to-bucket data1 p20001234_backup

will create the bucket:

   p20001234_backup

that contains the objects:

   data1/sample_1.dat
   data1/sample_2.dat
COMMAND LINE OPTIONS
allas-dir-to-bucket command line options:

 -d, --directory <dir_name>   Name of the directory or file to be uploaded
                              to Allas.
 -b, --bucket <bucket_name>   Name of the bucket into which the data is
                              uploaded.
 -s, --stream <number>        Number of simultaneous upload streams.
                              Default: 4.
 -m, --md5                    Check md5 sums of uploaded segments.
 -u, --user <csc-user-name>   Username linked to the data to be uploaded
                              (default: current username).
 -S, --sdc                    Use SD Connect based encryption in the upload.
 -h, --help                   Print this help.
EOF
exit 0
fi
if [[ -z "$OS_PASSWORD" ]]; then
echo "OS_PASSWORD not defined!"
echo "allas-dir-to-bucket needs to have your Allas password stored in an environment variable."
echo "Please setup you Allas connection with a command that sets this variable."
echo ""
echo " In Puhti and Mahti use:"
echo " allas-conf -k"
echo ""
echo " In Other servers use: "
echo " source allas_conf -k -u <your-csc-user-account>"
exit 1
fi
#check SD Connect settings if that is used
if [[ $sdc -eq 1 ]]; then
sdc_check=$(sd-lock-util pubkey | grep -c "BEGIN CRYPT4GH PUBLIC KEY")
if [[ $sdc_check -ne 1 ]]; then
echo "Connection to SD Connect service is not working."
echo "Please open or refresh the connection"
echo "by running command:"
echo ""
echo " source $allas_conf_path -k --sdc"
exit 1
fi
fi
if [[ $input == "" ]]; then
echo "Input directory not defined."
exit 1
fi
# In case of SD Connect encryption, check that there is enough space for the temporary encrypted copy
if [[ $sdc -eq 1 ]]; then
input_size=$(du -s "$input" | awk '{print $1}')
space_avail=$(df --output=avail "$tmp_root" | tail -1)
if [[ $input_size -gt $space_avail ]]; then
echo "Not enough space for temporary files!"
echo "$input_size KiB needed."
echo "$space_avail KiB available."
exit 1
fi
fi
if [[ $bucket_name == "" ]]; then
echo "Target bucket not defined."
exit 1
fi
#Check if rclone is available
if [[ $(which rclone 2> /dev/null | wc -l ) -ne 1 ]];then
echo ""
echo "rclone is not available!"
echo "Please install rclone."
exit 1
fi
#Check if jq is available
if [[ $(which jq 2> /dev/null | wc -l ) -ne 1 ]];then
echo ""
echo "jq is not available!"
echo "Please install jq."
exit 1
fi
#Check connection
check_swift_connection
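# Record when the current auth token was obtained, so that long upload
# loops can refresh the connection before the token expires.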
token_time=$(date +%s)
#Check bucket
#echo "rclone size allas:$bucket_name "
bucket_check=$(rclone size ${storage_server}:$bucket_name 2> /dev/null | wc -l)
if [[ $bucket_check -lt 2 ]]; then
bucket_elsewhere=$(rclone size ${storage_server}:$bucket_name 2>&1 | grep -c "forbidden" )
if [[ $bucket_elsewhere -ne 0 ]]; then
echo ""
echo "Bucket name $bucket_name is already used by some other project and you don't have access to it."
echo "Please use some other bucket name"
exit 1
fi
rclone mkdir ${storage_server}:$bucket_name
else
if [[ $force -eq 0 ]]; then
echo ""
echo "Bucket $bucket_name already exists!"
echo "Do you want to use this bucket [yes/no]"
read ans
else
ans="y"
fi
if [[ $ans == "y" || $ans == "yes" ]]; then
old_objects=$(rclone size ${storage_server}:${bucket_name}/${input} | grep "objects:" | awk '{print $NF}')
old_size=$(rclone size ${storage_server}:${bucket_name}/${input} | grep "size:" | awk '{print $3" "$4}')
if [[ $old_objects -gt 0 ]]; then
echo ""
echo "Are you really sure?"
echo "Allas location $bucket_name/${input} contains $old_objects objects "
echo "that include $old_size of data."
echo "This data will be overwritten or removed by this upload process."
echo "Do you want to use Allas location ${bucket_name}/${input} [yes/no]"
read ans2
if [[ $ans2 == "y" || $ans2 == "yes" ]]; then
echo "Target bucket: ${bucket_name}"
echo "Target directory: ${input}"
echo ""
echo "Removing old data"
rclone delete ${storage_server}:${bucket_name}/${input}
rclone delete ${storage_server}:${bucket_name}_segments/${input}
else
exit
fi
fi
else
exit
fi
fi
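# SD Connect mode: encrypt the data with sd-lock-util, move the resulting
# .c4gh files into a temporary directory tree that mirrors the source, and
# upload from that copy instead of the original directory.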
if [[ $sdc -eq 1 ]]; then
## Make an encrypted copy of the data
original_location=$(pwd)
tmpdir="$tmp_root/adtb_$$/"
echo "Encrypting the temporary copy."
sd-lock-util lock --container ${bucket_name} --no-content-upload --progress $input
echo ""
echo "Encryption ready."
mkdir -p $tmpdir
echo "Creating directory strecture inside $tmpdir"
IFS=$(echo -en "\t\n\b")
for ndir in $(find "$input" -type d )
do
mkdir -p "${tmpdir}/${ndir}"
done
echo "Copyind encrypted files to $tmpdir"
for ndfile in $(find "$input" -name "*.c4gh" )
do
mv "$ndfile" "${tmpdir}/${ndfile}"
done
cd "$tmpdir"
fi
echo "Uploading data from directory: $input to bucket: $bucket_name"
# Get files that are larger than 5 GiB
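# 5368709120 bytes = 5 GiB, the maximum size of a single (non-segmented)
# object in Swift; larger files must be uploaded as segmented objects.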
FILES_LARGE=$(find "$input" -type f -size +5368709120c)
num_large_files=$(echo $FILES_LARGE | wc -w )
echo "Directory $input contains $num_large_files files that are larger than 5 GiB"
# Upload large files. The speed benefit of concurrent segment upload varies,
# but it was consistently faster in testing.
fn=0
for file in $FILES_LARGE; do
(( fn = fn + 1 ))
filesize=$(stat --printf="%s" "$file")
(( filesize_mb = filesize / 1000000 ))
(( filesize_gb = filesize / 1073741824 ))
start_time=$(date +%s)
echo ""
echo "${fn}/${num_large_files} Uploading file $file ${filesize_gb}GiB."
(( token_age = start_time - token_time ))
if [[ $token_age -gt 3600 ]]; then
check_swift_connection
token_time=$start_time
fi
# echo "start_time $start_time"
# echo "concurrent_rclone_rcat.sh $file $bucket_name"
# Try 3 times to upload a large file
i=0
success=0
while [ $success -ne 1 ]; do
concurrent_rclone_rcat.sh "$file" "$bucket_name" "$check_md5"
if [[ $? -ne 0 ]]; then
echo "Data upload failed!"
fi
end_time=$(date +%s)
(( duration = end_time - start_time ))
(( speed = filesize_mb / duration ))
#check size of uploaded file
allas_size=$(rclone ls $RCLONE_DESTINATION:$bucket_name/$file | awk '{print $1}')
allas_segments_size=$(rclone ls $RCLONE_DESTINATION:${bucket_name}_segments/${file}| awk '{ a = a + $1 }END{print a}')
#echo allas_size $allas_size
#echo allas_segments_size $allas_segments_size
#echo filesize $filesize
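# A segmented upload succeeded if the manifest object reports the same
# size as the local file and the segments add up to that same size.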
if [[ $filesize -eq $allas_size && $allas_size -eq $allas_segments_size ]]; then
echo "Upload was done in ${duration}s. ( $speed MB/s )"
success=1
else
echo "Data upload failed!"
(( i = i + 1 ))
if [[ $i -gt 2 ]]; then
echo "Too many failed upload attempts"
failed_uploads+=("$file")
success=1
else
echo "Retrying to upload $file"
fi
if [[ $file != "" ]]; then
rclone delete $RCLONE_DESTINATION:$bucket_name/$file
rclone delete $RCLONE_DESTINATION:${bucket_name}_segments/$file
fi
fi
done
done
# Upload the rest
check_swift_connection
echo ""
echo "Starting to upload files that are smaller than 5 GB."
if [[ -d "$input" ]] ; then
rclone --transfers=$MAX_PROCESSES --swift-no-chunk copy --ignore-existing --progress $input $RCLONE_DESTINATION:$bucket_name/$input
else
rclone copyto --ignore-existing --progress $input $RCLONE_DESTINATION:$bucket_name/$input
fi
echo "DONE uploading ${1}"
if [[ ${#failed_uploads[@]} -gt 0 ]]; then
echo "NOTE! Not all files were successfully uploaded!"
echo "Upload failed for following files:"
for value in "${failed_uploads[@]}"
do
echo $value
done
fi
# Clean up in case of SD Connect upload
if [[ $sdc -eq 1 ]]; then
cd "$original_location"
rm -rf "$tmp_root/adtb_$$/"
fi
exit