12
12
from invenio_records .dictutils import dict_lookup
13
13
14
14
from ...datastreams .errors import TransformerError
15
- from ...datastreams .readers import SimpleHTTPReader
15
+ from ...datastreams .readers import SimpleHTTPReader , BaseReader
16
16
from ...datastreams .transformers import BaseTransformer
17
17
from ...datastreams .writers import ServiceWriter
18
+ import boto3
19
+ from flask import current_app
20
+ from datetime import datetime
21
+ from datetime import timedelta
22
+ import tarfile
23
+ import io
24
+ from concurrent .futures import ThreadPoolExecutor
25
+
26
class OrcidDataSyncReader(BaseReader):
    """ORCiD Data Sync Reader.

    Downloads the ORCiD "lambda" summary file from S3, extracts the ORCiD
    iDs modified within the sync window, and yields each iD's summary XML
    record as an in-memory byte string.
    """

    def _iter(self, fp, *args, **kwargs):
        """Not supported: this reader yields whole records from ``read``."""
        raise NotImplementedError(
            "OrcidDataSyncReader downloads one file and therefore does not iterate through items"
        )

    def read(self, item=None, *args, **kwargs):
        """Download the ORCiD lambda file and yield matching XML records.

        :param item: unused; present for the ``BaseReader`` interface.
        :yields: raw bytes of one ORCiD summary XML file per iD that was
            modified within the sync window. iDs whose record cannot be
            fetched are logged and skipped.
        """
        date_format = "%Y-%m-%d %H:%M:%S.%f"
        date_format_no_millis = "%Y-%m-%d %H:%M:%S"

        s3client = boto3.client(
            "s3",
            aws_access_key_id=current_app.config["VOCABULARIES_ORCID_ACCESS_KEY"],
            aws_secret_access_key=current_app.config["VOCABULARIES_ORCID_SECRET_KEY"],
        )
        response = s3client.get_object(
            Bucket="orcid-lambda-file", Key="last_modified.csv.tar"
        )
        tar_content = response["Body"].read()

        # NOTE(review): the original variable was named ``days_to_sync`` but
        # the value is consumed as *minutes* (60 * 9 = a 9-hour window) —
        # confirm the intended window. Value kept unchanged.
        sync_window_minutes = 60 * 9
        last_sync = datetime.now() - timedelta(minutes=sync_window_minutes)
        # TODO: Do we want to use last_run to keep track of the last time
        # the sync was run? Might not be ideal as it seems the file is
        # updated at midnight. (A prototype persisting a ``last_ran``
        # timestamp under VOCABULARIES_ORCID_FOLDER was removed.)

        def process_file(fileobj):
            """Return the ORCiDs modified at or after ``last_sync``.

            Assumes the CSV is sorted by last-modified date, newest first,
            so scanning stops at the first record older than the window —
            TODO confirm against the lambda file format.
            """
            file_content = fileobj.read().decode("utf-8")
            orcids = []
            for line in file_content.splitlines()[1:]:  # Skip the header line.
                elements = line.split(",")
                orcid = elements[0]
                last_modified_str = elements[3]
                # Timestamps appear with and without milliseconds.
                try:
                    last_modified_date = datetime.strptime(
                        last_modified_str, date_format
                    )
                except ValueError:
                    last_modified_date = datetime.strptime(
                        last_modified_str, date_format_no_millis
                    )
                if last_modified_date < last_sync:
                    break
                orcids.append(orcid)
            return orcids

        orcids_to_sync = []
        with tarfile.open(fileobj=io.BytesIO(tar_content)) as tar:
            for member in tar.getmembers():
                f = tar.extractfile(member)
                if f:
                    orcids_to_sync.extend(process_file(f))

        def fetch_orcid_data(orcid_to_sync, bucket):
            """Fetch one ORCiD summary XML from S3; ``None`` on any failure."""
            # Records are sharded by the last three characters of the iD.
            suffix = orcid_to_sync[-3:]
            key = f"{suffix}/{orcid_to_sync}.xml"
            try:
                file_response = s3client.get_object(Bucket=bucket, Key=key)
                return file_response["Body"].read()
            except Exception:
                # Best effort: log and skip this record rather than abort
                # the whole sync.
                current_app.logger.exception(
                    "Failed to fetch ORCiD record %s", orcid_to_sync
                )
                return None

        # TODO: allow configuring max_workers / test using asyncio instead.
        with ThreadPoolExecutor(max_workers=40) as executor:
            futures = [
                executor.submit(
                    fetch_orcid_data,
                    orcid,
                    current_app.config["VOCABULARIES_ORCID_SUMMARIES_BUCKET"],
                )
                for orcid in orcids_to_sync
            ]
            for future in futures:
                result = future.result()
                if result is not None:
                    yield result
18
103
19
104
20
105
class OrcidHTTPReader (SimpleHTTPReader ):
@@ -89,6 +174,7 @@ def _entry_id(self, entry):
89
174
90
175
# Registry of vocabulary datastream readers, keyed by the ``type`` name
# referenced from datastream configurations (e.g. ``DATASTREAM_CONFIG``).
VOCABULARIES_DATASTREAM_READERS = {
    "orcid-http": OrcidHTTPReader,
    "orcid-data-sync": OrcidDataSyncReader,
}
93
179
94
180
@@ -107,22 +193,32 @@ def _entry_id(self, entry):
107
193
# Datastream pipeline for syncing ORCiD names: download summary XML records,
# parse them, transform to name entries, and write through the names service
# asynchronously. (Removed the commented-out synchronous ``names-service``
# writer block it superseded.)
DATASTREAM_CONFIG = {
    "readers": [
        {
            "type": "orcid-data-sync",
        },
        {"type": "xml"},
    ],
    "transformers": [{"type": "orcid"}],
    "writers": [
        {
            "type": "async",
            "args": {
                "writer": {
                    "type": "names-service",
                    "args": {},
                }
            },
        }
    ],
    # TODO: current_app.config["VOCABULARIES_DATASTREAM_BATCH_SIZE"]
    "batch_size": 1000,
    "write_many": True,
}
127
223
"""ORCiD Data Stream configuration.
128
224
0 commit comments