From fcc3270265b5a6c0baff55bd742e2c3689b73387 Mon Sep 17 00:00:00 2001 From: GodMeowIceSun Date: Fri, 31 May 2024 15:17:08 +0800 Subject: [PATCH] fix(#456): fix default charset problem Signed-off-by: GodMeowIceSun --- .../owasp/validator/html/CleanResults.java | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/owasp/validator/html/CleanResults.java b/src/main/java/org/owasp/validator/html/CleanResults.java index fad145a..bea2cab 100644 --- a/src/main/java/org/owasp/validator/html/CleanResults.java +++ b/src/main/java/org/owasp/validator/html/CleanResults.java @@ -25,10 +25,14 @@ package org.owasp.validator.html; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.Callable; +import java.util.stream.Collectors; + import org.w3c.dom.DocumentFragment; /** @@ -163,7 +167,22 @@ public DocumentFragment getCleanXMLDocumentFragment() { * @see #getCleanHTML() */ public List getErrorMessages() { - return errorMessages; + return getErrorMessages(Charset.defaultCharset()); + } + + /** + * Return a list of error messages -- but an empty list returned does not mean there was no attack + * present, due to the serialization and deserialization process automatically cleaning up some + * attacks. Only the output of the {@code getCleanHTML()} should be considered safe. See the + * project README file and {@code CleanResults} class documentation for further discussion. + * + * @param charset - The character set for returning error messages. + * @return An ArrayList object which contains the error messages, if any, after a scan. + * @see Project README + * @see #getCleanHTML() + */ + public List getErrorMessages(Charset charset) { + return Collections.unmodifiableList(errorMessages.stream().map(i->new String(i.getBytes(StandardCharsets.ISO_8859_1), charset)).collect(Collectors.toList())); } /**