Skip to content

Commit

Permalink
#387 corrected some bugs with the XML parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
ecwood committed Jul 27, 2024
1 parent f5c7274 commit 6db935f
Showing 1 changed file with 51 additions and 43 deletions.
94 changes: 51 additions & 43 deletions misc-tools/owlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

KEY_TAG = "tag"
KEY_ATTRIBUTES = "attributes"
KEY_TEXT = "text"
KEY_TEXT = "ENTRY_TEXT"
KEY_TYPE = "type"

IGNORED_ATTRIBUTES = ["xml:lang"]
Expand Down Expand Up @@ -121,7 +121,8 @@ def convert_line(line):
# Categorize the type of line
line_type = str()
out = dict()
if tag == COMMENT or tag in OUTMOST_TAGS_SKIP:

if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP:
line_type = LINE_TYPE_IGNORE
else:
start_tag_exists = (tag != str())
Expand Down Expand Up @@ -160,50 +161,60 @@ def convert_line(line):
return out


def convert_nest(nest, index, working_dict):
if index >= len(nest):
return working_dict
def convert_nest(nest, start_index):
nest_dict = dict()
curr_index = start_index

while curr_index < len(nest):
element = nest[curr_index]
line_type = element[KEY_TYPE]
line_tag = element[KEY_TAG]
line_text = element.get(KEY_TEXT, None)
line_attributes = element.get(KEY_ATTRIBUTES, None)

if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]:
if line_tag not in nest_dict:
nest_dict[line_tag] = list()

element = nest[index]
line_type = element[KEY_TYPE]
line_tag = element[KEY_TAG]
line_text = element.get(KEY_TEXT, None)
line_attributes = element.get(KEY_ATTRIBUTES, None)
converted_nest, ret_index = convert_nest(nest, curr_index + 1)

if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]:
working_dict[line_tag] = dict()
if line_attributes is not None:
for attribute in line_attributes:
converted_nest[attribute] = line_attributes[attribute]

converted_nest = convert_nest(nest, index + 1, dict())
working_dict[line_tag] = converted_nest
nest_dict[line_tag].append(converted_nest)

if line_type == LINE_TYPE_START_NEST_WITH_ATTR:
working_dict[line_tag][KEY_ATTRIBUTES] = line_attributes
curr_index = ret_index + 1
continue

if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]:
if line_tag not in working_dict:
working_dict[line_tag] = list()
if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]:
if line_tag not in nest_dict:
nest_dict[line_tag] = list()

curr_dict = dict()
curr_dict = dict()

if line_text is not None:
curr_dict[KEY_TEXT] = line_text
if line_text is not None:
curr_dict[KEY_TEXT] = line_text

if line_attributes is not None:
for attribute in line_attributes:
curr_dict[attribute] = line_attributes[attribute]
if line_attributes is not None:
for attribute in line_attributes:
curr_dict[attribute] = line_attributes[attribute]

working_dict[line_tag].append(curr_dict)
nest_dict[line_tag].append(curr_dict)

convert_nest(nest, index + 1, working_dict)
curr_index += 1
continue

return working_dict
if line_type in [LINE_TYPE_END_NEST]:
return nest_dict, curr_index

return nest_dict, curr_index


def divide_into_lines(input_file_name):
curr_str = ""
curr_nest = list()
curr_nest_tag = str()
curr_nest_tags = list() # Treating it as a stack

with open(input_file_name) as input_file:
for line in input_file:
Expand All @@ -219,29 +230,26 @@ def divide_into_lines(input_file_name):

if letter == '>' and (next_letter == '<' or next_letter == ""):
# Only return if nesting
# print(curr_str)
line_parsed = convert_line(curr_str)

tag = line_parsed.get(KEY_TAG, None)
assert tag != KEY_TEXT # This could cause a massive conflict, but it is unlikely
line_type = line_parsed.get(KEY_TYPE, None)
attribute_keys = line_parsed.get(KEY_ATTRIBUTES, dict()).keys()

if curr_nest_tag == str():
if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]:
curr_nest_tag = tag
curr_nest.append(line_parsed)
elif line_type != LINE_TYPE_IGNORE:
print("THIS VERSION")
print(json.dumps(line_parsed, indent=4)) # replacement for processing right now
else:
if line_type == LINE_TYPE_END_NEST and curr_nest_tag == tag:
print(json.dumps(curr_nest, indent=4)) # replacement for processing right now
nest_dict = convert_nest(curr_nest, 0, dict())
if line_type != LINE_TYPE_IGNORE:
curr_nest.append(line_parsed)

if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]:
curr_nest_tags.append(tag)
elif line_type == LINE_TYPE_END_NEST:
popped_curr_nest_tag = curr_nest_tags.pop()
assert popped_curr_nest_tag == tag
if len(curr_nest_tags) == 0:
nest_dict, _ = convert_nest(curr_nest, 0)
print(json.dumps(nest_dict, indent=4))
curr_nest = list()
curr_nest_tag = str()
else:
curr_nest.append(line_parsed)

curr_str = ""

Expand Down

0 comments on commit 6db935f

Please sign in to comment.