-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathprocesscpcfiles.py
78 lines (70 loc) · 3.86 KB
/
processcpcfiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/python3.5
#Author: Sasan Bahadaran
#Date: 3/31/17
#Organization: Commerce Data Service
#Description: This script is meant to parse files from
# http://www.cooperativepatentclassification.org/cpcSchemeAndDefinitions/Bulk.html
# in order to output cpc codes and related information into a structured csv file.
import sys, os, csv
from lxml import etree
def parseXML(filetoproc):
try:
doccontent = []
parser = etree.XMLParser(remove_pis=True)
tree = etree.parse(filetoproc, parser=parser)
root = tree.getroot()
for item in root.xpath('//definitions/definition-item'):
cpc_content = []
main_cpc_code = item.find('classification-symbol').text
cpc_content.append(main_cpc_code)
title_text = ''.join(item.xpath('definition-title/descendant-or-self::*/text()'))
cpc_content.append(title_text)
title_text_codes = '|'.join(item.xpath('definition-title//class-ref/text()'))
cpc_content.append(title_text_codes)
informative_ref_text = ''.join(item.xpath('references/informative-references/section-body/table/table-row/table-column//paragraph-text/text()'))
cpc_content.append(informative_ref_text)
informative_ref_cpc_codes = '|'.join(item.xpath('references/informative-references/section-body/table/table-row/table-column//paragraph-text/class-ref/text()'))
cpc_content.append(informative_ref_cpc_codes)
special_rules_text = ''.join(item.xpath('special-rules/section-body//paragraph-text/text()'))
cpc_content.append(special_rules_text)
special_rules_cpc_codes = '|'.join(item.xpath('special-rules/section-body//paragraph-text/class-ref/text()'))
cpc_content.append(special_rules_cpc_codes)
glossary_of_terms = ''.join(item.xpath('glossary-of-terms/section-body/table/table-row/table-column//paragraph-text/text()'))
cpc_content.append(glossary_of_terms)
doccontent.append(cpc_content)
return doccontent
except IOError as e:
print('I/O error({0}): {1}'.format(e.errno,e.strerror))
def writeResults(content, fname):
try:
if(os.path.isfile(fname)):
print('file: {} exists!'.format(fname))
else:
with open(fname,'w') as outfile:
writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
writer.writerow(('cpc_code', 'title', 'title_cpc_codes', 'informative_references', \
'informative_references_cpc_codes', 'special_rules', 'special_rules_cpc_codes',\
'glossary_of_terms'))
writer.writerows(content)
except IOError as e:
print('I/O error({0}): {1}'.format(e.errno,e.strerror))
#change extension of file name to specified extension
def changeExt(fname, ext):
seq = (os.path.splitext(fname)[0], ext)
return '.'.join(seq)
if __name__ == '__main__':
scriptpath = os.path.dirname(os.path.abspath(__file__))
for fdir in os.listdir(os.path.join(scriptpath, 'xml_files')):
if os.path.isdir(os.path.join(scriptpath, 'xml_files', fdir)):
print(fdir)
for filename in os.listdir(os.path.join(scriptpath, 'xml_files', fdir)):
if filename.endswith('.xml'):
fname = os.path.join(scriptpath, 'xml_files', fdir, filename)
print('FILE: {}'.format(fname))
output_fname = os.path.join(scriptpath, 'xml_files', fdir, (os.path.basename(fname)).split('-')[2])
output_fname = changeExt(output_fname, 'csv')
if (os.path.isfile(output_fname)):
print('FILE: {} already exists!'.format(output_fname))
else:
content = parseXML(fname)
writeResults(content, output_fname)