Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NUTCH-2856] Implement a protocol-smb plugin based on hierynomus/smbj #826

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ lib/spotbugs-*
ivy/dependency-check-ant/*
.gradle*
ivy/apache-rat-*
.vscode
3 changes: 2 additions & 1 deletion conf/log4j2.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
<Appenders>
<RollingFile name="RollingFile" fileName="${hadoop.log.dir}/${hadoop.log.file}"
filePattern="${hadoop.log.dir}/$${date:yyyy-MM}/nutch-%d{yyyy-MM-dd}.log.gz">
<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />
<!--<PatternLayout pattern="%d %p %c{1.} [%t] %m%n" />-->
<PatternLayout pattern="%d %p %c [%t] %m%n" />
<CronTriggeringPolicy schedule="0 0 0 * * ?" evaluateOnStartup="true" />
<DefaultRolloverStrategy>
<Delete basePath="${hadoop.log.dir}" maxDepth="2">
Expand Down
22 changes: 22 additions & 0 deletions conf/url-authentication.xml.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please provide ALv2 header

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

<!-- Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- TODO: the credentials in this file are stored in plain text; it should be
replaced by (or backed by) an encrypted vault. -->
<!-- Each authentication entry combines a URL pattern with the user, domain
and password attributes. The entries below are sample values only.
NOTE(review): the patterns look like regular expressions matched against the
URL being fetched; confirm against the code that reads this file. -->
<url-authentication>
<authentication pattern="^smb://host/share.*" user="user" domain="domain" password="password"/>
<authentication pattern="^smb://hiran@nas/Documents.*" user="hiran" domain="domain" password="password"/>
</url-authentication>
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/crawl/Injector.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ private String filterNormalize(String url) {
if (filters != null)
url = filters.filter(url); // filter the url
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
LOG.warn("Skipping {}", url, e);
url = null;
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
<ant dir="protocol-httpclient" target="deploy"/>
<ant dir="protocol-interactiveselenium" target="deploy" />
<ant dir="protocol-okhttp" target="deploy"/>
<ant dir="protocol-smb" target="deploy"/>
<ant dir="protocol-selenium" target="deploy" />
<ant dir="publish-rabbitmq" target="deploy"/>
<ant dir="scoring-depth" target="deploy"/>
Expand Down Expand Up @@ -142,6 +143,7 @@
<ant dir="protocol-http" target="test"/>
<ant dir="protocol-httpclient" target="test"/>
<ant dir="protocol-okhttp" target="test"/>
<ant dir="protocol-smb" target="test"/>
<ant dir="scoring-orphan" target="test"/>
<ant dir="scoring-metadata" target="test"/>
<ant dir="subcollection" target="test"/>
Expand Down Expand Up @@ -226,6 +228,7 @@
<ant dir="protocol-httpclient" target="clean"/>
<ant dir="protocol-interactiveselenium" target="clean" />
<ant dir="protocol-okhttp" target="clean"/>
<ant dir="protocol-smb" target="clean"/>
<ant dir="protocol-selenium" target="clean" />
<ant dir="publish-rabbitmq" target="clean"/>
<ant dir="scoring-depth" target="clean"/>
Expand Down
22 changes: 22 additions & 0 deletions src/plugin/protocol-smb/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Ant build file of the protocol-smb plugin. It declares no targets of its
own: the default target "jar-core" has to come from the imported file. -->
<project name="protocol-smb" default="jar-core">

<!-- Pull in the build definitions from build-plugin.xml in the parent directory. -->
<import file="../build-plugin.xml"/>

</project>
54 changes: 54 additions & 0 deletions src/plugin/protocol-smb/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- Ivy module descriptor of the protocol-smb plugin: module metadata, the
included configurations, the published artifact and the library dependencies. -->
<ivy-module xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="http://ant.apache.org/ivy/schemas/ivy.xsd"
xmlns:ns0="http://ant.apache.org/ivy/maven" version="2.0">
<!-- The module name is taken from the Ant project name of the build that loads this file. -->
<info organisation="org.apache.nutch" module="${ant.project.name}">
<license name="Apache 2.0"/>
<ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
<description>
Apache Nutch
</description>
</info>

<!-- Configurations are not defined here but included from ivy/ivy-configurations.xml. -->
<configurations>
<include file="../../..//ivy/ivy-configurations.xml"/>
</configurations>

<publications>
<!--get the artifact from our module name-->
<artifact conf="master"/>
</publications>

<dependencies>
<!-- SMB client library this plugin is built on. The version must stay in
step with the smbj jar name listed in plugin.xml. -->
<dependency org="com.hierynomus" name="smbj" rev="0.13.0"/>
<!--
These dependencies are either contained in smbj (transitive) or
already provided by Nutch itself. Hence they can remain commented out.

<dependency org="net.engio" name="mbassador" rev="1.3.0"/>
<dependency org="org.bouncycastle" name="bcprov-jdk18on" rev="1.75"/>
<dependency org="com.hierynomus" name="asn-one" rev="0.6.0"/>
<dependency org="commons-io" name="commons-io" rev="2.17.0"/>
-->
<!-- NOTE(review): guava is declared here, but plugin.xml has no matching
library entry in its runtime section. Confirm whether the plugin needs its
own copy or relies on one provided elsewhere. -->
<dependency org="com.google.guava" name="guava" rev="33.3.1-jre"/>

</dependencies>

</ivy-module>
53 changes: 53 additions & 0 deletions src/plugin/protocol-smb/plugin.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!-- Plugin descriptor for protocol-smb: lists the jars that make up the
plugin, the plugin it requires and the extension it contributes. -->
<plugin
id="protocol-smb"
name="SMB Protocol based on https://github.com/hierynomus/smbj"
version="1.0.0"
provider-name="Hiran Chaudhuri">

<!-- Jar files of the plugin. Only protocol-smb.jar carries an export entry;
the other jars are listed without one. The file names embed version numbers.
NOTE(review): they presumably have to match what ivy.xml resolves (smbj 0.13.0
directly, the others transitively); re-check them whenever smbj is upgraded. -->
<runtime>
<library name="asn-one-0.6.0.jar"/>
<library name="bcprov-jdk18on-1.75.jar"/>
<library name="mbassador-1.3.0.jar"/>
<library name="protocol-smb.jar">
<export name="*"/>
</library>
<library name="smbj-0.13.0.jar"/>

<library name="commons-io-2.17.0.jar"/>
</runtime>

<!-- This plugin depends on the nutch-extensionpoints plugin. -->
<requires>
<import plugin="nutch-extensionpoints"/>
</requires>

<!-- Contributes org.apache.nutch.protocol.smb.SmbProtocol to the
org.apache.nutch.protocol.Protocol extension point. -->
<extension id="org.apache.nutch.protocol.smb"
name="SmbProtocol"
point="org.apache.nutch.protocol.Protocol">

<!-- protocolName is set to "smb"; urlStreamHandler names the SmbHandler
class of this plugin. -->
<implementation id="org.apache.nutch.protocol.smb.Smb"
class="org.apache.nutch.protocol.smb.SmbProtocol">
<parameter name="protocolName" value="smb"/>
<parameter name="urlStreamHandler" value="org.apache.nutch.protocol.smb.SmbHandler"/>
</implementation>

</extension>

</plugin>
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.smb;

import java.net.URL;
import java.net.URLConnection;
import java.net.URLStreamHandler;

/**
 * {@link URLStreamHandler} of the protocol-smb plugin. Every connection
 * request is answered with a new {@link SmbURLConnection} constructed from
 * the requested URL.
 */
public class SmbHandler extends URLStreamHandler {

  /**
   * Builds the connection object for the given URL.
   *
   * @param target the URL a connection is requested for
   * @return a new {@link SmbURLConnection} constructed with {@code target}
   */
  @Override
  protected URLConnection openConnection(URL target) {
    final URLConnection connection = new SmbURLConnection(target);
    return connection;
  }
}
Loading