#387 corrected some bugs with the XML parsing

RTXteam · Jul 27, 2024 · 6db935f · 6db935f
1 parent f5c7274
commit 6db935f
Showing 1 changed file with 51 additions and 43 deletions.
diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py
@@ -17,7 +17,7 @@
 
 KEY_TAG = "tag"
 KEY_ATTRIBUTES = "attributes"
-KEY_TEXT = "text"
+KEY_TEXT = "ENTRY_TEXT"
 KEY_TYPE = "type"
 
 IGNORED_ATTRIBUTES = ["xml:lang"]
@@ -121,7 +121,8 @@ def convert_line(line):
 	# Categorize the type of line
 	line_type = str()
 	out = dict()
-	if tag == COMMENT or tag in OUTMOST_TAGS_SKIP:
+
+	if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP:
 		line_type = LINE_TYPE_IGNORE
 	else:
 		start_tag_exists = (tag != str())
@@ -160,50 +161,60 @@ def convert_line(line):
 	return out
 
 
-def convert_nest(nest, index, working_dict):
-	if index >= len(nest):
-		return working_dict
+def convert_nest(nest, start_index):
+	nest_dict = dict()
+	curr_index = start_index
+
+	while curr_index < len(nest):
+		element = nest[curr_index]
+		line_type = element[KEY_TYPE]
+		line_tag = element[KEY_TAG]
+		line_text = element.get(KEY_TEXT, None)
+		line_attributes = element.get(KEY_ATTRIBUTES, None)
+
+		if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]:
+			if line_tag not in nest_dict:
+				nest_dict[line_tag] = list()
 
-	element = nest[index]
-	line_type = element[KEY_TYPE]
-	line_tag = element[KEY_TAG]
-	line_text = element.get(KEY_TEXT, None)
-	line_attributes = element.get(KEY_ATTRIBUTES, None)
+			converted_nest, ret_index = convert_nest(nest, curr_index + 1)
 
-	if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]:
-		working_dict[line_tag] = dict()
+			if line_attributes is not None:
+				for attribute in line_attributes:
+					converted_nest[attribute] = line_attributes[attribute]
 
-		converted_nest = convert_nest(nest, index + 1, dict())
-		working_dict[line_tag] = converted_nest
+			nest_dict[line_tag].append(converted_nest)
 
-		if line_type == LINE_TYPE_START_NEST_WITH_ATTR:
-			working_dict[line_tag][KEY_ATTRIBUTES] = line_attributes
+			curr_index = ret_index + 1
+			continue
 
-	if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]:
-		if line_tag not in working_dict:
-			working_dict[line_tag] = list()
+		if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]:
+			if line_tag not in nest_dict:
+				nest_dict[line_tag] = list()
 
-		curr_dict = dict()
+			curr_dict = dict()
 
-		if line_text is not None:
-			curr_dict[KEY_TEXT] = line_text
+			if line_text is not None:
+				curr_dict[KEY_TEXT] = line_text
 
-		if line_attributes is not None:
-			for attribute in line_attributes:
-				curr_dict[attribute] = line_attributes[attribute]
+			if line_attributes is not None:
+				for attribute in line_attributes:
+					curr_dict[attribute] = line_attributes[attribute]
 
-		working_dict[line_tag].append(curr_dict)
+			nest_dict[line_tag].append(curr_dict)
 
-		convert_nest(nest, index + 1, working_dict)
+			curr_index += 1
+			continue
 
-	return working_dict
+		if line_type in [LINE_TYPE_END_NEST]:
+			return nest_dict, curr_index
 
+	return nest_dict, curr_index
 
 
 def divide_into_lines(input_file_name):
 	curr_str = ""
 	curr_nest = list()
-	curr_nest_tag = str()
+	curr_nest_tags = list() # Treating it as a stack
 
 	with open(input_file_name) as input_file:
 		for line in input_file:
@@ -219,29 +230,26 @@ def divide_into_lines(input_file_name):
 
 				if letter == '>' and (next_letter == '<' or next_letter == ""):
 					# Only return if nesting
-					# print(curr_str)
 					line_parsed = convert_line(curr_str)
 
 					tag = line_parsed.get(KEY_TAG, None)
+					assert tag != KEY_TEXT # This could cause a massive conflict, but it is unlikely
 					line_type = line_parsed.get(KEY_TYPE, None)
 					attribute_keys = line_parsed.get(KEY_ATTRIBUTES, dict()).keys()
 
-					if curr_nest_tag == str():
-						if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]:
-							curr_nest_tag = tag
-							curr_nest.append(line_parsed)
-						elif line_type != LINE_TYPE_IGNORE:
-							print("THIS VERSION")
-							print(json.dumps(line_parsed, indent=4)) # replacement for processing right now
-					else:
-						if line_type == LINE_TYPE_END_NEST and curr_nest_tag == tag:
-							print(json.dumps(curr_nest, indent=4)) # replacement for processing right now
-							nest_dict = convert_nest(curr_nest, 0, dict())
+					if line_type != LINE_TYPE_IGNORE:
+						curr_nest.append(line_parsed)
+
+					if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]:
+						curr_nest_tags.append(tag)
+					elif line_type == LINE_TYPE_END_NEST:
+						popped_curr_nest_tag = curr_nest_tags.pop()
+						assert popped_curr_nest_tag == tag
+						if len(curr_nest_tags) == 0:
+							nest_dict, _ = convert_nest(curr_nest, 0)
 							print(json.dumps(nest_dict, indent=4))
 							curr_nest = list()
 							curr_nest_tag = str()
-						else:
-							curr_nest.append(line_parsed)
 
 					curr_str = ""