Skip to content

Commit

Permalink
Manually add line numbers to xml validation errors
Browse files Browse the repository at this point in the history
This makes the error messages slightly more descriptive.
Also this enables the validator to parse all the errors
from the file at once.
  • Loading branch information
csc-jm committed Oct 27, 2022
1 parent 55f2be3 commit 5cab919
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 34 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Separated operator classes into their own files for better readability #625
- updated ENA XML and JSON schemas to 1.16 #628
- XML to JSON parser had to be adjusted for `assemblyGraph`
- XML validation errors now compile all the failing elements into same error message and include the line number in the error reason #630

### Removed

Expand Down
73 changes: 42 additions & 31 deletions metadata_backend/helpers/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,17 @@

import re
from io import StringIO
from typing import Dict
from urllib.error import URLError
from typing import Dict, List

import ujson
from aiohttp import web
from defusedxml.ElementTree import ParseError
from defusedxml.ElementTree import tostring as etree_tostring
from defusedxml.ElementTree import ParseError, parse
from jsonschema import Draft202012Validator, validators
from jsonschema.exceptions import ValidationError
from jsonschema.protocols import Validator
from xmlschema import XMLSchema, XMLSchemaValidationError
from xmlschema import XMLSchema, XMLSchemaChildrenValidationError

from ..helpers.logger import LOG
from .logger import LOG
from .schema_loader import JSONSchemaLoader, SchemaNotFoundException


Expand All @@ -38,9 +36,15 @@ def resp_body(self) -> str:
:raises: HTTPBadRequest if URLError was raised during validation
"""
try:
self.schema.validate(self.xml_content)
LOG.info("Submitted file is totally valid.")
return ujson.dumps({"isValid": True})
root = parse(StringIO(self.xml_content)).getroot()
errors: List = list(self.schema.iter_errors(root))
if errors:
LOG.info("Submitted file contains some errors.")
response = self._format_xml_validation_error_reason(errors)
else:
LOG.info("Submitted file is totally valid.")
response = {"isValid": True}
return ujson.dumps(response)

except ParseError as error:
reason = self._parse_error_reason(error)
Expand All @@ -52,33 +56,40 @@ def resp_body(self) -> str:
LOG.exception("Submitted file does not not contain valid XML syntax.")
return ujson.dumps({"isValid": False, "detail": {"reason": reason, "instance": instance}})

except XMLSchemaValidationError as error:
# Parse reason and instance from the validation error message
reason = str(error.reason)
response: Dict = {"isValid": False, "detail": {"reason": reason}}
if error.elem:
instance = etree_tostring(error.elem, encoding="unicode")
# Replace element address in reason with instance element
if "<" in reason and ">" in reason:
instance_parent = "".join((instance.split(">")[0], ">"))
reason = re.sub("<[^>]*>", instance_parent + " ", reason)
response["detail"]["reason"] = reason
response["detail"]["instance"] = instance

LOG.exception("Submitted file is not valid against schema.")
return ujson.dumps(response)

except URLError as error:
reason = f"Faulty file was provided. {error.reason}."
LOG.error(reason)
raise web.HTTPBadRequest(reason=reason)

def _parse_error_reason(self, error: ParseError) -> str:
"""Generate better error reason."""
"""Generate better error reason for ParseError."""
reason = str(error).split(":", maxsplit=1)[0]
position = (str(error).split(":")[1])[1:]
return f"Faulty XML file was given, {reason} at {position}"

def _format_xml_validation_error_reason(self, errors: List) -> Dict:
"""Generate the response json object for validation error(s)."""
response: Dict = {"isValid": False, "detail": []}
found_lines = []
for error in errors:
reason = str(error.reason)
instance = str(error.path)

# Add line number to error reason
lines = self.xml_content.split("\n")
elem_name = error.obj[error.index].tag if isinstance(error, XMLSchemaChildrenValidationError) else error.obj
for (i, line) in enumerate(lines, 1):
if elem_name in line and i not in found_lines:
line_num = i
found_lines.append(i)
break
if re.match(r"^.*position [0-9]+", reason):
# line number replaces element position which is more valuable information
reason = re.sub(r"position [0-9]+", f"line {line_num}", reason)
else:
# line number still added as extra info to error reason
reason = reason + f" (line {line_num})"
detail = {"reason": reason, "instance": instance}
response["detail"].append(detail)

response["detail"] = response["detail"][0] if len(response["detail"]) == 1 else response["detail"]
return response

@property
def is_valid(self) -> bool:
"""Quick method for checking validation result."""
Expand Down
1 change: 1 addition & 0 deletions tests/test_files/study/SRP000539_invalid6.xml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@
</STUDY_ATTRIBUTES>
</STUDY>
<BAD_ELEMENT></BAD_ELEMENT>
<ANOTHER_BAD_ELEMENT></ANOTHER_BAD_ELEMENT>
</STUDY_SET>
24 changes: 21 additions & 3 deletions tests/unit/test_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ async def test_validation_passes_for_valid_xml(self):
data = self.create_submission_data(files)
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
self.assertEqual(response.status, 200)
self.assertIn('{"isValid":true}', await response.text())
self.assertEqual({"isValid": True}, await response.json())

async def test_validation_fails_bad_schema(self):
"""Test validation fails for bad schema and valid xml."""
Expand All @@ -372,7 +372,9 @@ async def test_validation_fails_for_invalid_xml_syntax(self):
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
resp_dict = await response.json()
self.assertEqual(response.status, 200)
self.assertIn("Faulty XML file was given, mismatched tag", resp_dict["detail"]["reason"])
self.assertEqual(
"Faulty XML file was given, mismatched tag at line 7, column 10", resp_dict["detail"]["reason"]
)

async def test_validation_fails_for_invalid_xml(self):
"""Test validation endpoint for invalid xml."""
Expand All @@ -382,7 +384,23 @@ async def test_validation_fails_for_invalid_xml(self):
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
resp_dict = await response.json()
self.assertEqual(response.status, 200)
self.assertIn("value must be one of", resp_dict["detail"]["reason"])
self.assertIn("attribute existing_study_type='Something wrong'", resp_dict["detail"]["reason"])
self.assertIn("(line 11)", resp_dict["detail"]["reason"])
self.assertEqual("/STUDY_SET/STUDY/DESCRIPTOR/STUDY_TYPE", resp_dict["detail"]["instance"])

async def test_validation_fails_for_another_invalid_xml(self):
"""Test validation endpoint for another invalid xml."""
with self.p_get_sess_restapi:
files = [("study", "SRP000539_invalid6.xml")]
data = self.create_submission_data(files)
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
resp_dict = await response.json()
self.assertEqual(response.status, 200)
self.assertEqual("Unexpected child with tag 'BAD_ELEMENT' at line 34.", resp_dict["detail"][0]["reason"])
self.assertEqual(
"Unexpected child with tag 'ANOTHER_BAD_ELEMENT' at line 35.", resp_dict["detail"][1]["reason"]
)
self.assertEqual("/STUDY_SET", resp_dict["detail"][0]["instance"])

async def test_validation_fails_with_too_many_files(self):
"""Test validation endpoint for too many files."""
Expand Down

0 comments on commit 5cab919

Please sign in to comment.