Skip to content

Commit

Permalink
Merge pull request #630 from CSCfi/feature/add-line-numbers-to-valida…
Browse files Browse the repository at this point in the history
…tion-errors

Add line numbers to xml validation errors
  • Loading branch information
csc-felipe authored Nov 2, 2022
2 parents 4b1f489 + 2ce8086 commit 97ab973
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 37 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Refactor `Operator` -> `ObjectOperator` #627
- Refactor `xml_object.py` -> `object_xml.py` #627
- Refactor operators to use a common base class called `BaseOperator` #627
- XML validation errors now compile all the failing elements into same error message and include the line number in the error reason #630

### Removed

Expand Down
7 changes: 4 additions & 3 deletions docs/specification.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/403Forbidden"
/v1/submit:
/v1/submit/{workflow}:
post:
tags:
- Submission
Expand Down Expand Up @@ -1823,17 +1823,18 @@ components:
properties:
isValid:
type: boolean
description: true of false if valid. if false a detail key would be good to detail the error but not required
description: True or false if valid. If false, a detail key is included that details the error
detail:
type: object
description: details of the error if there was one.
description: details of the error if there were some
properties:
reason:
type: string
description: reason that caused the validation error
instance:
type: string
description: specific element(s) where error occurred

SubmissionCreated:
type: object
required:
Expand Down
73 changes: 42 additions & 31 deletions metadata_backend/helpers/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,17 @@

import re
from io import StringIO
from typing import Dict
from urllib.error import URLError
from typing import Dict, List

import ujson
from aiohttp import web
from defusedxml.ElementTree import ParseError
from defusedxml.ElementTree import tostring as etree_tostring
from defusedxml.ElementTree import ParseError, parse
from jsonschema import Draft202012Validator, validators
from jsonschema.exceptions import ValidationError
from jsonschema.protocols import Validator
from xmlschema import XMLSchema, XMLSchemaValidationError
from xmlschema import XMLSchema, XMLSchemaChildrenValidationError

from ..helpers.logger import LOG
from .logger import LOG
from .schema_loader import JSONSchemaLoader, SchemaNotFoundException


Expand All @@ -38,9 +36,15 @@ def resp_body(self) -> str:
:raises: HTTPBadRequest if URLError was raised during validation
"""
try:
self.schema.validate(self.xml_content)
LOG.info("Submitted file is totally valid.")
return ujson.dumps({"isValid": True})
root = parse(StringIO(self.xml_content)).getroot()
errors: List = list(self.schema.iter_errors(root))
if errors:
LOG.info("Submitted file contains some errors.")
response = self._format_xml_validation_error_reason(errors)
else:
LOG.info("Submitted file is totally valid.")
response = {"isValid": True}
return ujson.dumps(response)

except ParseError as error:
reason = self._parse_error_reason(error)
Expand All @@ -52,33 +56,40 @@ def resp_body(self) -> str:
LOG.exception("Submitted file does not not contain valid XML syntax.")
return ujson.dumps({"isValid": False, "detail": {"reason": reason, "instance": instance}})

except XMLSchemaValidationError as error:
# Parse reason and instance from the validation error message
reason = str(error.reason)
response: Dict = {"isValid": False, "detail": {"reason": reason}}
if error.elem:
instance = etree_tostring(error.elem, encoding="unicode")
# Replace element address in reason with instance element
if "<" in reason and ">" in reason:
instance_parent = "".join((instance.split(">")[0], ">"))
reason = re.sub("<[^>]*>", instance_parent + " ", reason)
response["detail"]["reason"] = reason
response["detail"]["instance"] = instance

LOG.exception("Submitted file is not valid against schema.")
return ujson.dumps(response)

except URLError as error:
reason = f"Faulty file was provided. {error.reason}."
LOG.error(reason)
raise web.HTTPBadRequest(reason=reason)

def _parse_error_reason(self, error: ParseError) -> str:
"""Generate better error reason."""
"""Generate better error reason for ParseError."""
reason = str(error).split(":", maxsplit=1)[0]
position = (str(error).split(":")[1])[1:]
return f"Faulty XML file was given, {reason} at {position}"

def _format_xml_validation_error_reason(self, errors: List) -> Dict:
"""Generate the response json object for validation error(s)."""
response: Dict = {"isValid": False, "detail": {"reason": "", "instance": ""}}
found_lines = []
for error in errors:
reason = str(error.reason)
instance = str(error.path)

# Add line number to error reason
lines = self.xml_content.split("\n")
elem_name = error.obj[error.index].tag if isinstance(error, XMLSchemaChildrenValidationError) else error.obj
for (i, line) in enumerate(lines, 1):
if elem_name in line and i not in found_lines:
line_num = i
found_lines.append(i)
break
if re.match(r"^.*at position [0-9]+", reason):
# line number replaces element position which is more valuable information
reason = re.sub(r"position [0-9]+", f"line {line_num}", reason)
else:
# line number still added as extra info to error reason
reason = reason + f" (line {line_num})"
response["detail"]["reason"] = response["detail"]["reason"] + reason + "\n"
response["detail"]["instance"] = response["detail"]["instance"] + instance + "\n"

response["detail"] = response["detail"][0] if len(response["detail"]) == 1 else response["detail"]
return response

@property
def is_valid(self) -> bool:
"""Quick method for checking validation result."""
Expand Down
1 change: 1 addition & 0 deletions tests/test_files/study/SRP000539_invalid6.xml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@
</STUDY_ATTRIBUTES>
</STUDY>
<BAD_ELEMENT></BAD_ELEMENT>
<ANOTHER_BAD_ELEMENT></ANOTHER_BAD_ELEMENT>
</STUDY_SET>
24 changes: 21 additions & 3 deletions tests/unit/test_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ async def test_validation_passes_for_valid_xml(self):
data = self.create_submission_data(files)
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
self.assertEqual(response.status, 200)
self.assertIn('{"isValid":true}', await response.text())
self.assertEqual({"isValid": True}, await response.json())

async def test_validation_fails_bad_schema(self):
"""Test validation fails for bad schema and valid xml."""
Expand All @@ -393,7 +393,9 @@ async def test_validation_fails_for_invalid_xml_syntax(self):
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
resp_dict = await response.json()
self.assertEqual(response.status, 200)
self.assertIn("Faulty XML file was given, mismatched tag", resp_dict["detail"]["reason"])
self.assertEqual(
"Faulty XML file was given, mismatched tag at line 7, column 10", resp_dict["detail"]["reason"]
)

async def test_validation_fails_for_invalid_xml(self):
"""Test validation endpoint for invalid xml."""
Expand All @@ -403,7 +405,23 @@ async def test_validation_fails_for_invalid_xml(self):
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
resp_dict = await response.json()
self.assertEqual(response.status, 200)
self.assertIn("value must be one of", resp_dict["detail"]["reason"])
self.assertIn("attribute existing_study_type='Something wrong'", resp_dict["detail"]["reason"])
self.assertIn("(line 11)", resp_dict["detail"]["reason"])
self.assertEqual("/STUDY_SET/STUDY/DESCRIPTOR/STUDY_TYPE\n", resp_dict["detail"]["instance"])

async def test_validation_fails_for_another_invalid_xml(self):
"""Test validation endpoint for another invalid xml."""
with self.p_get_sess_restapi:
files = [("study", "SRP000539_invalid6.xml")]
data = self.create_submission_data(files)
response = await self.client.post(f"{API_PREFIX}/validate", data=data)
resp_dict = await response.json()
self.assertEqual(response.status, 200)
self.assertIn("Unexpected child with tag 'BAD_ELEMENT' at line 34.\n", resp_dict["detail"]["reason"])
self.assertIn(
"Unexpected child with tag 'ANOTHER_BAD_ELEMENT' at line 35.\n", resp_dict["detail"]["reason"]
)
self.assertEqual("/STUDY_SET\n/STUDY_SET\n", resp_dict["detail"]["instance"])

async def test_validation_fails_with_too_many_files(self):
"""Test validation endpoint for too many files."""
Expand Down

0 comments on commit 97ab973

Please sign in to comment.