From 139ed2c504c0476b5933eac44953e3c5ed4ee27d Mon Sep 17 00:00:00 2001 From: Xenios91 Date: Sat, 29 Sep 2018 03:02:56 -0500 Subject: [PATCH 1/3] Made some bug fixes and corrected 2 XML vulnerabilities. --- .../src/main/java/us/codecraft/webmagic/Site.java | 4 ++-- .../webmagic/downloader/HttpClientGenerator.java | 12 ++++++------ .../us/codecraft/webmagic/downloader/package.html | 1 + .../java/us/codecraft/webmagic/model/package.html | 2 ++ .../java/us/codecraft/webmagic/utils/IPUtils.java | 12 ++++++++---- .../codecraft/webmagic/selector/Xpath2Selector.java | 9 +++++++-- .../downloader/selenium/SeleniumDownloader.java | 4 ++-- 7 files changed, 28 insertions(+), 16 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index b6963ca43..27a92dbc8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -98,9 +98,9 @@ public Site setUserAgent(String userAgent) { /** * get cookies * - * @return get cookies + * @return get defaultCookies */ - public Map getCookies() { + public Map getDefaultCookies() { return defaultCookies; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 28a16f41d..8fc6ff34f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -65,15 +65,15 @@ private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyM // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 X509TrustManager trustManager = new X509TrustManager() { - @Override - public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { + + public void checkClientTrusted(X509Certificate[] chain, String authType) { } - @Override - public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { + + public void checkServerTrusted(X509Certificate[] chain, String authType) { } - @Override + public X509Certificate[] getAcceptedIssuers() { return null; } @@ -135,7 +135,7 @@ private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { return; } CookieStore cookieStore = new BasicCookieStore(); - for (Map.Entry cookieEntry : site.getCookies().entrySet()) { + for (Map.Entry cookieEntry : site.getDefaultCookies().entrySet()) { BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue()); cookie.setDomain(site.getDomain()); cookieStore.addCookie(cookie); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html index 719abd975..b74f141dc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html @@ -1,3 +1,4 @@ + Downloader is the part that downloads web pages and store in Page object. diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html index 63a6784c6..bbff357fd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html @@ -1,4 +1,6 @@ + + <body> Page model and annotations used to customize a crawler. </body> diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java index 3d416964b..be1288c22 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/IPUtils.java @@ -22,10 +22,14 @@ public static String getFirstNoLoopbackIPAddresses() throws SocketException { Enumeration<InetAddress> inetAddresses = networkInterface.getInetAddresses(); while (inetAddresses.hasMoreElements()) { InetAddress address = inetAddresses.nextElement(); - if (!address.isLoopbackAddress() && !Inet6Address.class.isInstance(address)) { - return address.getHostAddress(); - } else if (!address.isLoopbackAddress()) { - localAddress = address; + if (address != null) { + if (!address.isLoopbackAddress() && !Inet6Address.class.isInstance(address)) { + return address.getHostAddress(); + } else if (!address.isLoopbackAddress()) { + localAddress = address; + } + }else{ + throw new NullPointerException("Address is null"); } } } diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 98b1efe4b..0c1e0081a 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -11,6 +11,7 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; +import javax.xml.XMLConstants; import javax.xml.namespace.NamespaceContext; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; @@ -126,7 +127,9 @@ public String select(String text) { return item.getTextContent(); } else { StreamResult xmlOutput = new StreamResult(new StringWriter()); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); + TransformerFactory tff = TransformerFactory.newInstance(); + tff.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); + Transformer transformer = tff.newTransformer(); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); transformer.transform(new DOMSource(item), xmlOutput); return xmlOutput.getWriter().toString(); @@ -154,7 +157,9 @@ public List<String> selectList(String text) { } if (result instanceof NodeList) { NodeList nodeList = (NodeList) result; - Transformer transformer = TransformerFactory.newInstance().newTransformer(); + TransformerFactory tff = TransformerFactory.newInstance(); + tff.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); + Transformer transformer = tff.newTransformer(); StreamResult xmlOutput = new StreamResult(); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); for (int i = 0; i < nodeList.getLength(); i++) { diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index f45f7e2a8..c13e98047 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -87,8 +87,8 @@ public Page download(Request request, Task task) { } WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry<String, String> cookieEntry : site.getCookies() + if (site.getDefaultCookies() != null) { + for (Map.Entry<String, String> cookieEntry : site.getDefaultCookies() .entrySet()) { Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); From 828398ac83e78073224c231348fedd3b5006ace8 Mon Sep 17 00:00:00 2001 From: Xenios91 <hartman91@ymail.com> Date: Sat, 29 Sep 2018 03:24:26 -0500 Subject: [PATCH 2/3] Made some bug and vulnerability fixes. --- .../downloader/HttpClientGenerator.java | 4 ++-- .../webmagic/downloader/package.html | 1 + .../java/us/codecraft/webmagic/package.html | 11 ++++++----- .../webmagic/pipeline/FilePipeline.java | 2 +- .../codecraft/webmagic/pipeline/package.html | 1 + .../codecraft/webmagic/processor/package.html | 2 ++ .../codecraft/webmagic/scheduler/package.html | 1 + .../us/codecraft/webmagic/selector/Html.java | 2 +- .../webmagic/utils/FilePersistentBase.java | 19 ++++++------------- .../model/formatter/BasicTypeFormatter.java | 4 ++-- .../pipeline/FilePageModelPipeline.java | 2 +- .../pipeline/JsonFilePageModelPipeline.java | 2 +- .../webmagic/pipeline/JsonFilePipeline.java | 2 +- 13 files changed, 26 insertions(+), 27 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 8fc6ff34f..bb53d170c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -66,11 +66,11 @@ private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyM X509TrustManager trustManager = new X509TrustManager() { - public void checkClientTrusted(X509Certificate[] chain, String authType) { + public void checkClientTrusted(X509Certificate[] chain, String authType) throws IllegalArgumentException { } - public void checkServerTrusted(X509Certificate[] chain, String authType) { + public void checkServerTrusted(X509Certificate[] chain, String authType) throws IllegalArgumentException { } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html index b74f141dc..95ef786fb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html @@ -1,6 +1,7 @@ <!DOCTYPE html> <html> <body> + <title>"Package" Downloader is the part that downloads web pages and store in Page object. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html index 491afd93b..9c3fd3fb5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html @@ -1,7 +1,8 @@ - -
- Main class "Spider" and models. -
- + +"Package" +
+ Main class "Spider" and models. +
+ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index be9fd7cc2..08bed9566 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -36,7 +36,7 @@ public FilePipeline(String path) { @Override public void process(ResultItems resultItems, Task task) { - String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; + String path = this.path + PATH_SEPARATOR + task.getUUID() + PATH_SEPARATOR; try { PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html index 6b0fcee22..adcb81d3d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html @@ -1,5 +1,6 @@ + "Package" Pipeline is the persistent and offline process part of crawler. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html index 5ec7537b0..5caca3675 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html @@ -1,4 +1,6 @@ + +"Package" PageProcessor custom part of a crawler for specific site. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html index e67edcc6b..0ff5e6790 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html @@ -1,5 +1,6 @@ + "Package" Scheduler is the part of url management. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index f2218f126..1a4ccb361 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -23,7 +23,7 @@ public class Html extends HtmlNode { * Disable jsoup html entity escape. It can be set just before any Html instance is created. * @deprecated */ - public static boolean DISABLE_HTML_ENTITY_ESCAPE = false; + public static final boolean DISABLE_HTML_ENTITY_ESCAPE = false; /** * Store parsed document for better performance when only one text exist. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java index 79b9efece..0e5e77e2d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java @@ -12,29 +12,22 @@ public class FilePersistentBase { protected String path; - public static String PATH_SEPERATOR = "/"; - - static { - String property = System.getProperties().getProperty("file.separator"); - if (property != null) { - PATH_SEPERATOR = property; - } - } + protected static final String PATH_SEPARATOR = System.getProperties().getProperty("file.separator"); public void setPath(String path) { - if (!path.endsWith(PATH_SEPERATOR)) { - path += PATH_SEPERATOR; + if (!path.endsWith(PATH_SEPARATOR)) { + path += PATH_SEPARATOR; } this.path = path; } - public File getFile(String fullName) { + protected File getFile(String fullName) { checkAndMakeParentDirecotry(fullName); return new File(fullName); } - public void checkAndMakeParentDirecotry(String fullName) { - int index = fullName.lastIndexOf(PATH_SEPERATOR); + private void checkAndMakeParentDirecotry(String fullName) { + int index = fullName.lastIndexOf(PATH_SEPARATOR); if (index > 0) { String path = fullName.substring(0, index); File file = new File(path); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java index f9d76a845..a5084304e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -25,11 +25,11 @@ public T format(String raw) throws Exception { protected abstract T formatTrimmed(String raw) throws Exception; - public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, + protected static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class); - public static Class detectBasicClass(Class type) { + protected static Class detectBasicClass(Class type) { if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { return Integer.class; } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java index 0db9b819d..c0eb94245 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/FilePageModelPipeline.java @@ -37,7 +37,7 @@ public FilePageModelPipeline(String path) { @Override public void process(Object o, Task task) { - String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; + String path = this.path + PATH_SEPARATOR + task.getUUID() + PATH_SEPARATOR; try { String filename; if (o instanceof HasKey) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java index 7a7f80a25..431e7b2e6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -38,7 +38,7 @@ public JsonFilePageModelPipeline(String path) { @Override public void process(Object o, Task task) { - String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; + String path = this.path + PATH_SEPARATOR + task.getUUID() + PATH_SEPARATOR; try { String filename; if (o instanceof HasKey) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java index 3ff42bf10..a4632dc7e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -35,7 +35,7 @@ public JsonFilePipeline(String path) { @Override public void process(ResultItems resultItems, Task task) { - String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; + String path = this.path + PATH_SEPARATOR + task.getUUID() + PATH_SEPARATOR; try { PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"))); printWriter.write(JSON.toJSONString(resultItems.getAll())); From 99eecb02cf0da976447403af6cd8949b14d150e8 Mon Sep 17 00:00:00 2001 From: Corey Date: Sat, 29 Sep 2018 21:13:07 -0500 Subject: [PATCH 3/3] Small change Set a value to final that broke the build --- .../src/main/java/us/codecraft/webmagic/selector/Html.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 1a4ccb361..f2218f126 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -23,7 +23,7 @@ public class Html extends HtmlNode { * Disable jsoup html entity escape. It can be set just before any Html instance is created. * @deprecated */ - public static final boolean DISABLE_HTML_ENTITY_ESCAPE = false; + public static boolean DISABLE_HTML_ENTITY_ESCAPE = false; /** * Store parsed document for better performance when only one text exist.