gluejob-chkcashed.py
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import current_timestamp, input_file_name
print "==========================================================================================="
print "==========================================================================================="
## @params: [TempDir, JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir','JOB_NAME'])
args = getResolvedOptions(sys.argv, ['input_file_path'])
print "The input file path is -: ", args['input_file_path']
print "The TempDir is -: ", args['TempDir']
input_file = args['input_file_path']
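# This job expects the S3 path of the file to load in the --input_file_path job argument,
# together with the standard --TempDir and --JOB_NAME arguments. For reference, the upstream
# AWS Lambda caller could start the job roughly like the sketch below (the job name and file
# name are illustrative assumptions, not taken from this script):
#
#   import boto3
#   glue = boto3.client('glue')
#   glue.start_job_run(
#       JobName='gluejob-chkcashed',
#       Arguments={'--input_file_path': 's3://viac-intg-vdw-collection/kyriba/checks/example.csv'}
#   )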
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "redshift-intg", table_name = "checks", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
#datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "redshift-intg", table_name = "checks", transformation_ctx = "datasource0")
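# Read the input CSV straight from S3 (instead of from the Glue Data Catalog) into a DynamicFrame,
# treating the first row as column headers.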
dyF = glueContext.create_dynamic_frame.from_options(
    's3',
    {'paths': [args['input_file_path']]},
    'csv',
    {'withHeader': True})
print "Full record count: ", dyF.count()
dyF.printSchema()
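# Convert to a Spark DataFrame and stamp every row with the source file name and the load timestamp.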
datasource1 = dyF.toDF().withColumn("load_file", input_file_name()).withColumn("load_datetime", current_timestamp())
print(type(dyF))
#datasource1.describe().show()
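# Convert back to a DynamicFrame so the Glue transforms below can be applied.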
datasource2 = DynamicFrame.fromDF(datasource1, glueContext, "datasource2")
print("After converting back to a DynamicFrame:")
print(type(datasource2))
datasource2.printSchema()
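# Keep only the rows that came from the file passed in via --input_file_path.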
datasource4 = Filter.apply(frame = datasource2, f = lambda x: x["load_file"] == input_file)
print "input_file_path from AWS Lambda -:" , input_file
#print "Filtered records count -: ", datasource4.count()
print "Filtered records count - : ", Filter.apply(frame = datasource4, f = lambda x: x["load_file"] == input_file).count()
## @type: ApplyMapping
## @args: [mapping = [("account", "string", "acct", "string"), ("check_amount", "string", "chk_amt", "decimal(20,2)"), ("account_status", "string", "acct_stts", "string"), ("transaction_date", "string", "tran_dt", "date"), ("check_number", "string", "chk_num", "string"), ("payee", "string", "payee", "string"), ("check_cashed_date", "string", "chk_cashed_dt", "date")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource4]
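# Rename the raw CSV columns to the target Redshift column names and cast amounts and dates.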
applymapping1 = ApplyMapping.apply(frame = datasource4, mappings = [("load_file", "string", "load_file", "string"),("load_datetime", "timestamp", "load_datetime", "timestamp"), ("account", "string", "acct", "string"), ("check_amount", "string", "chk_amt", "decimal(20,2)"), ("account_status", "string", "acct_stts", "string"), ("transaction_date", "string", "tran_dt", "date"), ("check_number", "string", "chk_num", "string"), ("payee", "string", "payee", "string"), ("check_cashed_date", "string", "chk_cashed_dt", "date")], transformation_ctx = "applymapping1")
## @type: SelectFields
## @args: [paths = ["payee", "load_datetime", "tran_dt", "acct_stts", "chk_num", "load_file", "acct", "chk_amt", "chk_cashed_dt"], transformation_ctx = "selectfields2"]
## @return: selectfields2
## @inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["payee", "load_datetime", "tran_dt", "acct_stts", "chk_num", "load_file", "acct", "chk_amt", "chk_cashed_dt"], transformation_ctx = "selectfields2")
## @type: ResolveChoice
## @args: [choice = "MATCH_CATALOG", database = "redshift-intg", table_name = "intg_payment_hist_kyriba_chkcashed", transformation_ctx = "resolvechoice3"]
## @return: resolvechoice3
## @inputs: [frame = selectfields2]
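# Resolve column types against the catalog definition of the target Redshift table,
# then split any still-ambiguous columns with make_cols.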
resolvechoice3 = ResolveChoice.apply(frame = selectfields2, choice = "MATCH_CATALOG", database = "redshift-intg", table_name = "intg_payment_hist_kyriba_chkcashed", transformation_ctx = "resolvechoice3")
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice4"]
## @return: resolvechoice4
## @inputs: [frame = resolvechoice3]
resolvechoice4 = ResolveChoice.apply(frame = resolvechoice3, choice = "make_cols", transformation_ctx = "resolvechoice4")
## @type: DropNullFields
## @args: [transformation_ctx = "resolvechoice5"]
## @return: resolvechoice5
## @inputs: [frame = resolvechoice4]
resolvechoice5 = DropNullFields.apply(frame = resolvechoice4, transformation_ctx = "resolvechoice5")
print(type(resolvechoice5))
resolvechoice5.printSchema()
## @type: DataSink
## @args: [database = "redshift-intg", table_name = "intg_payment_hist_kyriba_chkcashed", redshift_tmp_dir = TempDir, transformation_ctx = "datasink5"]
## @return: datasink5
## @inputs: [frame = resolvechoice5]
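# Write the cleaned records to the Redshift table through the Glue catalog connection,
# using TempDir as the intermediate staging location.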
datasink5 = glueContext.write_dynamic_frame.from_catalog(frame = resolvechoice5, database = "redshift-intg", table_name = "intg_payment_hist_kyriba_chkcashed", redshift_tmp_dir = args["TempDir"], transformation_ctx = "datasink5")
job.commit()
print "==========================================================================================="
print "==========================================================================================="
print " File movement is started : "
import boto3
bucketname = "viac-intg-vdw-collection"
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucketname)
source = "kyriba/checks"
target = "kyriba/processed"
source_filename = ''
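# After a successful load, move the processed input file from kyriba/checks to kyriba/processed:
# S3 has no native move, so the object is copied to the target prefix and the original is deleted.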
for obj in my_bucket.objects.filter(Prefix=source):
    source_filename = obj.key.split('/')[-1]
    copy_source = {
        'Bucket': bucketname,
        'Key': obj.key
    }
    target_filename = "{}/{}".format(target, source_filename)
    # Skip the bare prefix/"folder" object and any file that is not the one just loaded.
    if source_filename and source_filename in input_file:
        print("Moving the input file -: ", input_file)
        print("Source file name -: ", source_filename)
        print("copy_source -: ", copy_source)
        print("Bucket name -: ", bucketname)
        s3.meta.client.copy(copy_source, bucketname, target_filename)
        print("Target folder is - s3://viac-intg-vdw-collection/kyriba/processed/")
        s3.Object(bucketname, obj.key).delete()
        print("File moved successfully to -: ", target_filename)
print "==========================================================================================="
print "==========================================================================================="