diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb02ec0f5..b915396e7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -74,6 +74,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Separated operator classes into their own files for better readability #625
- updated ENA XML and JSON schemas to 1.16 #628
- XML to JSON parser had to be adjusted for `assemblyGraph`
+- XML validation errors now compile all the failing elements into same error message and include the line number in the error reason #630
### Removed
diff --git a/metadata_backend/helpers/validator.py b/metadata_backend/helpers/validator.py
index 20a56fc39..116473e0c 100644
--- a/metadata_backend/helpers/validator.py
+++ b/metadata_backend/helpers/validator.py
@@ -2,19 +2,17 @@
import re
from io import StringIO
-from typing import Dict
-from urllib.error import URLError
+from typing import Dict, List
import ujson
from aiohttp import web
-from defusedxml.ElementTree import ParseError
-from defusedxml.ElementTree import tostring as etree_tostring
+from defusedxml.ElementTree import ParseError, parse
from jsonschema import Draft202012Validator, validators
from jsonschema.exceptions import ValidationError
from jsonschema.protocols import Validator
-from xmlschema import XMLSchema, XMLSchemaValidationError
+from xmlschema import XMLSchema, XMLSchemaChildrenValidationError
-from ..helpers.logger import LOG
+from .logger import LOG
from .schema_loader import JSONSchemaLoader, SchemaNotFoundException
@@ -38,9 +36,15 @@ def resp_body(self) -> str:
:raises: HTTPBadRequest if URLError was raised during validation
"""
try:
- self.schema.validate(self.xml_content)
- LOG.info("Submitted file is totally valid.")
- return ujson.dumps({"isValid": True})
+ root = parse(StringIO(self.xml_content)).getroot()
+ errors: List = list(self.schema.iter_errors(root))
+ if errors:
+ LOG.info("Submitted file contains some errors.")
+ response = self._format_xml_validation_error_reason(errors)
+ else:
+ LOG.info("Submitted file is totally valid.")
+ response = {"isValid": True}
+ return ujson.dumps(response)
except ParseError as error:
reason = self._parse_error_reason(error)
@@ -52,33 +56,40 @@ def resp_body(self) -> str:
LOG.exception("Submitted file does not not contain valid XML syntax.")
return ujson.dumps({"isValid": False, "detail": {"reason": reason, "instance": instance}})
- except XMLSchemaValidationError as error:
- # Parse reason and instance from the validation error message
- reason = str(error.reason)
- response: Dict = {"isValid": False, "detail": {"reason": reason}}
- if error.elem:
- instance = etree_tostring(error.elem, encoding="unicode")
- # Replace element address in reason with instance element
- if "<" in reason and ">" in reason:
- instance_parent = "".join((instance.split(">")[0], ">"))
- reason = re.sub("<[^>]*>", instance_parent + " ", reason)
- response["detail"]["reason"] = reason
- response["detail"]["instance"] = instance
-
- LOG.exception("Submitted file is not valid against schema.")
- return ujson.dumps(response)
-
- except URLError as error:
- reason = f"Faulty file was provided. {error.reason}."
- LOG.error(reason)
- raise web.HTTPBadRequest(reason=reason)
-
def _parse_error_reason(self, error: ParseError) -> str:
- """Generate better error reason."""
+ """Generate better error reason for ParseError."""
reason = str(error).split(":", maxsplit=1)[0]
position = (str(error).split(":")[1])[1:]
return f"Faulty XML file was given, {reason} at {position}"
+ def _format_xml_validation_error_reason(self, errors: List) -> Dict:
+ """Generate the response json object for validation error(s)."""
+ response: Dict = {"isValid": False, "detail": []}
+ found_lines = []
+ for error in errors:
+ reason = str(error.reason)
+ instance = str(error.path)
+
+ # Add line number to error reason
+ lines = self.xml_content.split("\n")
+ elem_name = error.obj[error.index].tag if isinstance(error, XMLSchemaChildrenValidationError) else error.obj
+ for (i, line) in enumerate(lines, 1):
+ if elem_name in line and i not in found_lines:
+ line_num = i
+ found_lines.append(i)
+ break
+ if re.match(r"^.*position [0-9]+", reason):
+ # line number replaces element position which is more valuable information
+ reason = re.sub(r"position [0-9]+", f"line {line_num}", reason)
+ else:
+ # line number still added as extra info to error reason
+ reason = reason + f" (line {line_num})"
+ detail = {"reason": reason, "instance": instance}
+ response["detail"].append(detail)
+
+ response["detail"] = response["detail"][0] if len(response["detail"]) == 1 else response["detail"]
+ return response
+
@property
def is_valid(self) -> bool:
"""Quick method for checking validation result."""
diff --git a/tests/test_files/study/SRP000539_invalid6.xml b/tests/test_files/study/SRP000539_invalid6.xml
index 10f2ca99b..e84060c15 100644
--- a/tests/test_files/study/SRP000539_invalid6.xml
+++ b/tests/test_files/study/SRP000539_invalid6.xml
@@ -32,4 +32,5 @@
+
diff --git a/tests/unit/test_handlers.py b/tests/unit/test_handlers.py
index c6c6a9c2d..5a978577a 100644
--- a/tests/unit/test_handlers.py
+++ b/tests/unit/test_handlers.py
@@ -354,7 +354,7 @@ async def test_validation_passes_for_valid_xml(self):
data = self.create_submission_data(files)
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
self.assertEqual(response.status, 200)
- self.assertIn('{"isValid":true}', await response.text())
+ self.assertEqual({"isValid": True}, await response.json())
async def test_validation_fails_bad_schema(self):
"""Test validation fails for bad schema and valid xml."""
@@ -372,7 +372,9 @@ async def test_validation_fails_for_invalid_xml_syntax(self):
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
resp_dict = await response.json()
self.assertEqual(response.status, 200)
- self.assertIn("Faulty XML file was given, mismatched tag", resp_dict["detail"]["reason"])
+ self.assertEqual(
+ "Faulty XML file was given, mismatched tag at line 7, column 10", resp_dict["detail"]["reason"]
+ )
async def test_validation_fails_for_invalid_xml(self):
"""Test validation endpoint for invalid xml."""
@@ -382,7 +384,23 @@ async def test_validation_fails_for_invalid_xml(self):
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
resp_dict = await response.json()
self.assertEqual(response.status, 200)
- self.assertIn("value must be one of", resp_dict["detail"]["reason"])
+ self.assertIn("attribute existing_study_type='Something wrong'", resp_dict["detail"]["reason"])
+ self.assertIn("(line 11)", resp_dict["detail"]["reason"])
+ self.assertEqual("/STUDY_SET/STUDY/DESCRIPTOR/STUDY_TYPE", resp_dict["detail"]["instance"])
+
+ async def test_validation_fails_for_another_invalid_xml(self):
+ """Test validation endpoint for another invalid xml."""
+ with self.p_get_sess_restapi:
+ files = [("study", "SRP000539_invalid6.xml")]
+ data = self.create_submission_data(files)
+ response = await self.client.post(f"{API_PREFIX}/validate", data=data)
+ resp_dict = await response.json()
+ self.assertEqual(response.status, 200)
+ self.assertEqual("Unexpected child with tag 'BAD_ELEMENT' at line 34.", resp_dict["detail"][0]["reason"])
+ self.assertEqual(
+ "Unexpected child with tag 'ANOTHER_BAD_ELEMENT' at line 35.", resp_dict["detail"][1]["reason"]
+ )
+ self.assertEqual("/STUDY_SET", resp_dict["detail"][0]["instance"])
async def test_validation_fails_with_too_many_files(self):
"""Test validation endpoint for too many files."""