diff --git a/CHANGELOG.md b/CHANGELOG.md index cb02ec0f5..b915396e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -74,6 +74,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Separated operator classes into their own files for better readability #625 - updated ENA XML and JSON schemas to 1.16 #628 - XML to JSON parser had to be adjusted for `assemblyGraph` +- XML validation errors now compile all the failing elements into same error message and include the line number in the error reason #630 ### Removed diff --git a/metadata_backend/helpers/validator.py b/metadata_backend/helpers/validator.py index 20a56fc39..116473e0c 100644 --- a/metadata_backend/helpers/validator.py +++ b/metadata_backend/helpers/validator.py @@ -2,19 +2,17 @@ import re from io import StringIO -from typing import Dict -from urllib.error import URLError +from typing import Dict, List import ujson from aiohttp import web -from defusedxml.ElementTree import ParseError -from defusedxml.ElementTree import tostring as etree_tostring +from defusedxml.ElementTree import ParseError, parse from jsonschema import Draft202012Validator, validators from jsonschema.exceptions import ValidationError from jsonschema.protocols import Validator -from xmlschema import XMLSchema, XMLSchemaValidationError +from xmlschema import XMLSchema, XMLSchemaChildrenValidationError -from ..helpers.logger import LOG +from .logger import LOG from .schema_loader import JSONSchemaLoader, SchemaNotFoundException @@ -38,9 +36,15 @@ def resp_body(self) -> str: :raises: HTTPBadRequest if URLError was raised during validation """ try: - self.schema.validate(self.xml_content) - LOG.info("Submitted file is totally valid.") - return ujson.dumps({"isValid": True}) + root = parse(StringIO(self.xml_content)).getroot() + errors: List = list(self.schema.iter_errors(root)) + if errors: + LOG.info("Submitted file contains some errors.") + response = self._format_xml_validation_error_reason(errors) + else: + LOG.info("Submitted file is totally valid.") + response = {"isValid": True} + return ujson.dumps(response) except ParseError as error: reason = self._parse_error_reason(error) @@ -52,33 +56,40 @@ def resp_body(self) -> str: LOG.exception("Submitted file does not not contain valid XML syntax.") return ujson.dumps({"isValid": False, "detail": {"reason": reason, "instance": instance}}) - except XMLSchemaValidationError as error: - # Parse reason and instance from the validation error message - reason = str(error.reason) - response: Dict = {"isValid": False, "detail": {"reason": reason}} - if error.elem: - instance = etree_tostring(error.elem, encoding="unicode") - # Replace element address in reason with instance element - if "<" in reason and ">" in reason: - instance_parent = "".join((instance.split(">")[0], ">")) - reason = re.sub("<[^>]*>", instance_parent + " ", reason) - response["detail"]["reason"] = reason - response["detail"]["instance"] = instance - - LOG.exception("Submitted file is not valid against schema.") - return ujson.dumps(response) - - except URLError as error: - reason = f"Faulty file was provided. {error.reason}." - LOG.error(reason) - raise web.HTTPBadRequest(reason=reason) - def _parse_error_reason(self, error: ParseError) -> str: - """Generate better error reason.""" + """Generate better error reason for ParseError.""" reason = str(error).split(":", maxsplit=1)[0] position = (str(error).split(":")[1])[1:] return f"Faulty XML file was given, {reason} at {position}" + def _format_xml_validation_error_reason(self, errors: List) -> Dict: + """Generate the response json object for validation error(s).""" + response: Dict = {"isValid": False, "detail": []} + found_lines = [] + for error in errors: + reason = str(error.reason) + instance = str(error.path) + + # Add line number to error reason + lines = self.xml_content.split("\n") + elem_name = error.obj[error.index].tag if isinstance(error, XMLSchemaChildrenValidationError) else error.obj + for (i, line) in enumerate(lines, 1): + if elem_name in line and i not in found_lines: + line_num = i + found_lines.append(i) + break + if re.match(r"^.*position [0-9]+", reason): + # line number replaces element position which is more valuable information + reason = re.sub(r"position [0-9]+", f"line {line_num}", reason) + else: + # line number still added as extra info to error reason + reason = reason + f" (line {line_num})" + detail = {"reason": reason, "instance": instance} + response["detail"].append(detail) + + response["detail"] = response["detail"][0] if len(response["detail"]) == 1 else response["detail"] + return response + @property def is_valid(self) -> bool: """Quick method for checking validation result.""" diff --git a/tests/test_files/study/SRP000539_invalid6.xml b/tests/test_files/study/SRP000539_invalid6.xml index 10f2ca99b..e84060c15 100644 --- a/tests/test_files/study/SRP000539_invalid6.xml +++ b/tests/test_files/study/SRP000539_invalid6.xml @@ -32,4 +32,5 @@ + diff --git a/tests/unit/test_handlers.py b/tests/unit/test_handlers.py index c6c6a9c2d..5a978577a 100644 --- a/tests/unit/test_handlers.py +++ b/tests/unit/test_handlers.py @@ -354,7 +354,7 @@ async def test_validation_passes_for_valid_xml(self): data = self.create_submission_data(files) response = await self.client.post(f"{API_PREFIX}/validate", data=data) self.assertEqual(response.status, 200) - self.assertIn('{"isValid":true}', await response.text()) + self.assertEqual({"isValid": True}, await response.json()) async def test_validation_fails_bad_schema(self): """Test validation fails for bad schema and valid xml.""" @@ -372,7 +372,9 @@ async def test_validation_fails_for_invalid_xml_syntax(self): response = await self.client.post(f"{API_PREFIX}/validate", data=data) resp_dict = await response.json() self.assertEqual(response.status, 200) - self.assertIn("Faulty XML file was given, mismatched tag", resp_dict["detail"]["reason"]) + self.assertEqual( + "Faulty XML file was given, mismatched tag at line 7, column 10", resp_dict["detail"]["reason"] + ) async def test_validation_fails_for_invalid_xml(self): """Test validation endpoint for invalid xml.""" @@ -382,7 +384,23 @@ async def test_validation_fails_for_invalid_xml(self): response = await self.client.post(f"{API_PREFIX}/validate", data=data) resp_dict = await response.json() self.assertEqual(response.status, 200) - self.assertIn("value must be one of", resp_dict["detail"]["reason"]) + self.assertIn("attribute existing_study_type='Something wrong'", resp_dict["detail"]["reason"]) + self.assertIn("(line 11)", resp_dict["detail"]["reason"]) + self.assertEqual("/STUDY_SET/STUDY/DESCRIPTOR/STUDY_TYPE", resp_dict["detail"]["instance"]) + + async def test_validation_fails_for_another_invalid_xml(self): + """Test validation endpoint for another invalid xml.""" + with self.p_get_sess_restapi: + files = [("study", "SRP000539_invalid6.xml")] + data = self.create_submission_data(files) + response = await self.client.post(f"{API_PREFIX}/validate", data=data) + resp_dict = await response.json() + self.assertEqual(response.status, 200) + self.assertEqual("Unexpected child with tag 'BAD_ELEMENT' at line 34.", resp_dict["detail"][0]["reason"]) + self.assertEqual( + "Unexpected child with tag 'ANOTHER_BAD_ELEMENT' at line 35.", resp_dict["detail"][1]["reason"] + ) + self.assertEqual("/STUDY_SET", resp_dict["detail"][0]["instance"]) async def test_validation_fails_with_too_many_files(self): """Test validation endpoint for too many files."""