-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathalterptabdocs.py
70 lines (61 loc) · 3.02 KB
/
alterptabdocs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python 3.5
#Author: Sasan Bahadaran
#Date: 8/11/16
#Organization: Commerce Data Service
#Description: This script alters existing JSON files created from PTAB data to format
#fields properly and add any fields necessary for the new Solr architecture
import sys, json, os, glob, time
import dateutil.parser
from datetime import datetime
#get field(date) and return it as seconds since the Unix epoch (UTC)
def convertToUTC(x, field):
    """Pop ``field`` from ``x`` and return its value as a UTC epoch timestamp.

    Args:
        x: mutable mapping (one data record). ``field`` is removed from it;
            the caller is expected to store the returned value back.
        field: key whose value is a 'YYYY-MM-DDTHH:MM:SSZ' string.

    Returns:
        float: seconds since 1970-01-01T00:00:00Z. The value
        '0001-01-01T00:00:00Z' is mapped to the fixed placeholder
        978325200.0 (2001-01-01T05:00:00Z) instead of being converted.

    Raises:
        KeyError: if ``field`` is not present in ``x``.
        ValueError: if the value does not match the expected format.
    """
    raw = str(x.pop(field))
    if raw == '0001-01-01T00:00:00Z':
        #placeholder kept byte-for-byte so existing consumers still see the same value
        return 978325200.0
    parsed = datetime.strptime(raw, '%Y-%m-%dT%H:%M:%SZ')
    #the trailing 'Z' marks the input as UTC, so measure from the UTC epoch
    #directly; time.mktime() would treat the value as local time and make the
    #result depend on the timezone of the machine running the script
    return (parsed - datetime(1970, 1, 1)).total_seconds()
def convertToReadableDate(date):
    """Return a 'YYYY-MM-DDTHH:MM:SSZ' string reformatted as 'MM/DD/YYYY'.

    The time-of-day part is parsed but not included in the result.
    Raises ValueError when ``date`` does not match the input format.
    """
    return datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ').strftime('%m/%d/%Y')
#this function contains the code for altering the existing
#JSON structure to format date fields, add type field, and
#add truncated text field
def alterJSON(fname):
    """Convert one JSON file and write the result to '<name>_altered<ext>'.

    For every record in doc['main']['DATA_RECORD']:
      * each date field listed below is replaced by the value returned from
        convertToUTC, and a 'derived_<field>' entry holding the
        convertToReadableDate form of the original string is added;
      * 'type' is set to 'ptab';
      * 'derived_textdata' is set to the first 32000 characters of 'textdata'.

    Args:
        fname: path of the JSON file to read. The output is written next to
            it, with '_altered' inserted before the extension.

    IOError is reported on stdout and swallowed; any other exception is
    reported on stdout and re-raised.
    """
    #fields that get an epoch value plus a readable 'derived_' copy
    date_fields = (
        'LAST_MODIFIED_TS',
        'PATENT_ISSUE_DT',
        'DECISION_MAILED_DT',
        'PRE_GRANT_PUBLICATION_DT',
        'APPLICANT_PUB_AUTHORIZATION_DT',
        'doc_date',
    )
    try:
        with open(os.path.abspath(fname)) as fd:
            print("--Reading file: "+fname)
            doc = json.load(fd)
        for x in doc['main']['DATA_RECORD']:
            for field in date_fields:
                #build the readable copy first: convertToUTC pops the raw string
                x['derived_'+field] = convertToReadableDate(x[field])
                x[field] = convertToUTC(x, field)
            x['type'] = 'ptab'
            x['derived_textdata'] = x['textdata'][:32000]
        #transform output to json and save it beside the input with an
        #'_altered' suffix (splitext lives in os.path, not in os)
        base, ext = os.path.splitext(fname)
        with open(base+'_altered'+ext,'w') as outfile:
            json.dump(doc,outfile)
        print("--Alteration of file complete")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno,e.strerror))
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
        raise
#convert every JSON file under files/PTAB/jsonfiles next to this script
if __name__ == '__main__':
    scriptpath = os.path.dirname(os.path.abspath(__file__))
    pattern = os.path.join(scriptpath,'files/PTAB','jsonfiles/*.json')
    for filename in glob.iglob(pattern):
        #alterJSON names its output '<name>_altered<ext>' in this same
        #directory; skip those so a re-run does not try to convert data
        #that has already been converted
        if os.path.splitext(filename)[0].endswith('_altered'):
            continue
        alterJSON(filename)