-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild.xml
165 lines (138 loc) · 7.57 KB
/
build.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
<?xml version="1.0"?>
<project name="OpenSextant-Gazetteer" default="zip" xmlns:ivy="antlib:org.apache.ivy.ant">
<!-- Load user-supplied settings first. Ant properties are immutable, so a
     value read from build.properties takes precedence over any later
     definition of the same name. kettle.home is not defined in this file;
     presumably it is supplied there or with -D on the command line. -->
<property file="${basedir}/build.properties" />

<!-- Initialise the timestamp properties; DSTAMP is used for the archive name. -->
<tstamp />

<!-- Directory that receives the created artifacts (resolved against basedir). -->
<property name="build.dir" location="build" />
<property name="version" value="2.1" />

<!-- Where the merged gazetteer output and the static resource files live. -->
<property name="outputDir" location="${basedir}/GazetteerETL/GeoData/Merged" />
<property name="resourceDir" location="${basedir}/GazetteerETL/Resources" />

<!-- Kettle (Kitchen) launcher scripts, one per platform. -->
<property name="kitchen.script.win" location="${kettle.home}/kitchen.bat" />
<property name="kitchen.script.unix" location="${kettle.home}/kitchen.sh" />

<!-- The Kettle job that performs the whole gazetteer build. -->
<property name="processGazetteer.script" location="${basedir}/GazetteerETL/BuildMergedGazetteer.kjb" />

<!-- Platform detection: selects which Kitchen launcher target runs.
     Each property is only set when its condition holds. -->
<condition property="isWindows" value="true">
    <os family="windows" />
</condition>
<condition property="isUnix" value="true">
    <or>
        <os family="mac" />
        <os family="unix" />
    </or>
</condition>

<!-- Name of the published archive, stamped with the build date. -->
<property name="publish.gaz.name" value="opensextant-gazetteer-${DSTAMP}.zip" />
<!--====================== Targets ============================-->
<!-- clean: empties the log, merged-output and transformed-data directories,
     then recreates an empty build directory. -->
<target name="clean">
    <!-- quiet: directories that do not exist yet are not treated as an error -->
    <delete includeemptydirs="true" quiet="true">
        <fileset includes="**/*" dir="${basedir}/GazetteerETL/Logs" />
        <fileset includes="**/*" dir="${basedir}/GazetteerETL/GeoData/Merged" />
        <fileset includes="**/*" dir="${basedir}/GazetteerETL/GeoData/transformed" />
    </delete>

    <!-- start from a fresh artifact directory -->
    <delete dir="${build.dir}" />
    <mkdir dir="${build.dir}" />
</target>
<!-- prepare: makes sure the working directories for logs, downloaded source
     data (NGA, USGS) and generated data (Merged, transformed) exist. -->
<target name="prepare">
    <!-- Kettle log output -->
    <mkdir dir="${basedir}/GazetteerETL/Logs" />

    <!-- downloaded source gazetteers -->
    <mkdir dir="${basedir}/GazetteerETL/GeoData/NGA" />
    <mkdir dir="${basedir}/GazetteerETL/GeoData/USGS" />

    <!-- generated data -->
    <mkdir dir="${basedir}/GazetteerETL/GeoData/Merged" />
    <mkdir dir="${basedir}/GazetteerETL/GeoData/transformed" />
</target>
<!-- copy-libs: copies the required jars shipped in GazetteerETL/lib into
     Kettle's lib directory so the Kettle job can load them. -->
<target name="copy-libs">
    <!-- Without kettle.home Ant leaves the literal text ${kettle.home} in the
         path and the jar would land in a bogus directory; stop early instead. -->
    <fail unless="kettle.home"
          message="kettle.home is not set; define it (e.g. in build.properties) to point at the Kettle installation." />

    <!--copy todir="${kettle.home}/lib" file="${basedir}/GazetteerETL/lib/opensextant-phonetics.jar" /-->
    <copy todir="${kettle.home}/lib" file="${basedir}/GazetteerETL/lib/opensextant.jar" />

    <!-- if using a Kettle version < 6.1, uncomment the line below to also copy commons-codec -->
    <!--copy todir="${kettle.home}/lib" file="${basedir}/GazetteerETL/lib/commons-codec.jar" /-->
</target>
<!-- setProxy: configures the HTTP proxy used when fetching the source
     gazetteer data. The target is skipped entirely when proxy.host is not
     defined, so builds on a direct connection do not pass the unexpanded
     ${proxy.host} / ${proxy.port} text to the setproxy task. -->
<target name="setProxy" if="proxy.host">
    <!-- Only takes effect when proxy.port was not supplied (properties are
         immutable); 80 is the setproxy task's own default port. -->
    <property name="proxy.port" value="80" />
    <setproxy proxyhost="${proxy.host}" proxyport="${proxy.port}" />
</target>
<!-- data.nga: downloads the NGA world gazetteer archive and unzips it into
     GazetteerETL/GeoData/NGA. -->
<!-- NOTE: make sure proxies are set if needed -->
<target name="data.nga">
    <!-- The archive name is built from NGA_date; without it the download URL
         would contain the literal text ${NGA_date}. -->
    <fail unless="NGA_date"
          message="NGA_date is not set; define it (e.g. in build.properties) so the NGA archive name can be built." />

    <echo> Retrieving Data from NGA </echo>

    <!-- NGA world Gazetteer -->
    <!-- NGA changed its file naming convention in 2013 -->
    <!-- property name="NGA_basename_2012" value="geonames_dd_dms_date_${NGA_date}" /-->
    <property name="NGA_basename" value="geonames_${NGA_date}" />

    <!-- Alternate FTP path to the same data, but far slower; use only as a last resort. -->
    <!--get src="ftp://ftp.nga.mil/pub2/gns_data/${NGA_basename}.zip" dest="${basedir}/GazetteerETL/GeoData/NGA" /-->
    <get src="http://geonames.nga.mil/gns/html/cntyfile/${NGA_basename}.zip" dest="${basedir}/GazetteerETL/GeoData/NGA" />
    <unzip src="${basedir}/GazetteerETL/GeoData/NGA/${NGA_basename}.zip" dest="${basedir}/GazetteerETL/GeoData/NGA" />

    <!-- Rename to the pre-2013 file name (currently disabled). -->
    <!--move file="${basedir}/GazetteerETL/GeoData/NGA/${NGA_basename}.txt" tofile="${basedir}/GazetteerETL/GeoData/NGA/geonames_dd_dms_date.txt" /-->
</target>
<!-- data.usgs: downloads the three USGS gazetteer archives, unzips them into
     GazetteerETL/GeoData/USGS and renames the extracted text files so that
     their names no longer carry the release date. -->
<target name="data.usgs">
    <!-- All three archive names are built from USGS_date; without it the
         download URLs would contain the literal text ${USGS_date}. -->
    <fail unless="USGS_date"
          message="USGS_date is not set; define it (e.g. in build.properties) so the USGS archive names can be built." />

    <echo> Retrieving Data from USGS </echo>

    <!-- The three parts of the USGS gazetteer -->
    <property name="USGS_basename_Nat" value="NationalFile_${USGS_date}" />
    <property name="USGS_basename_Gov" value="GOVT_UNITS_${USGS_date}" />
    <property name="USGS_basename_All" value="AllNames_${USGS_date}" />

    <!-- download -->
    <get src="http://geonames.usgs.gov/docs/stategaz/${USGS_basename_Nat}.zip" dest="${basedir}/GazetteerETL/GeoData/USGS" />
    <get src="http://geonames.usgs.gov/docs/stategaz/${USGS_basename_Gov}.zip" dest="${basedir}/GazetteerETL/GeoData/USGS" />
    <get src="http://geonames.usgs.gov/docs/stategaz/${USGS_basename_All}.zip" dest="${basedir}/GazetteerETL/GeoData/USGS" />

    <!-- extract -->
    <unzip src="${basedir}/GazetteerETL/GeoData/USGS/${USGS_basename_Nat}.zip" dest="${basedir}/GazetteerETL/GeoData/USGS" />
    <unzip src="${basedir}/GazetteerETL/GeoData/USGS/${USGS_basename_Gov}.zip" dest="${basedir}/GazetteerETL/GeoData/USGS" />
    <unzip src="${basedir}/GazetteerETL/GeoData/USGS/${USGS_basename_All}.zip" dest="${basedir}/GazetteerETL/GeoData/USGS" />

    <!-- strip the date from the extracted file names -->
    <move file="${basedir}/GazetteerETL/GeoData/USGS/${USGS_basename_Nat}.txt" tofile="${basedir}/GazetteerETL/GeoData/USGS/NationalFile.txt" />
    <move file="${basedir}/GazetteerETL/GeoData/USGS/${USGS_basename_Gov}.txt" tofile="${basedir}/GazetteerETL/GeoData/USGS/GOVT_UNITS.txt" />
    <move file="${basedir}/GazetteerETL/GeoData/USGS/${USGS_basename_All}.txt" tofile="${basedir}/GazetteerETL/GeoData/USGS/AllNames.txt" />
</target>
<!-- getOriginalSources: aggregate target. Its dependencies set the proxy and
     then fetch the NGA and USGS source data; the body only reports completion. -->
<target name="getOriginalSources"
        depends="setProxy,data.nga,data.usgs">
    <echo> Finished Retrieving Data Sources</echo>
</target>
<!-- build: full gazetteer build. After clean, prepare and copy-libs have run
     as dependencies, it downloads the source data and then launches the
     Kettle job. -->
<target name="build"
        depends="clean,prepare,copy-libs">
    <antcall target="getOriginalSources" />

    <!-- Both launcher targets are invoked; each is guarded by its own OS
         property (isWindows / isUnix), so at most one of them does any work. -->
    <antcall target="proc-gaz-win" />
    <antcall target="proc-gaz-linux" />
</target>
<!-- Run the overall gazetteer cleaning/transformation process in Kettle. -->
<!-- This produces the MergedGazetteer.txt gazetteer file, which contains the
     cleaned and merged gazetteer data ready to be ingested into the Solr
     gazetteer. -->
<!-- proc-gaz-win: Windows flavor of the Kettle call; runs only when isWindows is set. -->
<target name="proc-gaz-win" if="isWindows" depends="prepare">
    <!-- failonerror: a non-zero exit status from the launcher fails the build
         instead of letting later targets package stale or missing output. -->
    <exec executable="cmd" spawn="false" failonerror="true">
        <!-- JVM options handed to Kettle through its environment variable -->
        <env key="PENTAHO_DI_JAVA_OPTIONS" value="${kettle.options.jvm}" />
        <arg value="/c" />
        <arg value="${kitchen.script.win}" />
        <!-- the Kettle job to run -->
        <arg value="/file" />
        <arg value="${processGazetteer.script}" />
        <!-- where Kettle writes its log -->
        <arg value="/logfile" />
        <arg value="${basedir}/GazetteerETL/Logs/etl.log.txt" />
        <arg value="/level" />
        <arg value="Minimal" />
    </exec>
</target>
<!-- proc-gaz-linux: Unix/Mac flavor of the Kettle call; runs only when
     isUnix is set. Launches kitchen.sh directly on the gazetteer job. -->
<target name="proc-gaz-linux" if="isUnix" depends="prepare">
    <echo>Launching ${kitchen.script.unix}</echo>
    <echo>with options ${kettle.options.jvm} on kettle job ${processGazetteer.script}</echo>

    <!-- failonerror: a non-zero exit status from the launcher fails the build
         instead of letting later targets package stale or missing output. -->
    <exec executable="${kitchen.script.unix}" spawn="false" failonerror="true">
        <!-- JVM options handed to Kettle through its environment variable -->
        <env key="PENTAHO_DI_JAVA_OPTIONS" value="${kettle.options.jvm}" />
        <!-- the Kettle job to run -->
        <arg value="-file" />
        <arg value="${processGazetteer.script}" />
        <!-- where Kettle writes its log -->
        <arg value="-logfile" />
        <arg value="${basedir}/GazetteerETL/Logs/etl.log.txt" />
        <arg value="-level" />
        <arg value="Minimal" />
    </exec>
</target>
<!-- zip: default target. Runs the full build, then packages the two merged
     gazetteer files together with the country-name and feature-metadata
     resource files into the dated archive under the build directory. -->
<target name="zip" depends="build">
    <zip destfile="${build.dir}/${publish.gaz.name}">
        <!-- generated gazetteer output -->
        <fileset file="${outputDir}/MergedGazetteer.txt" />
        <fileset file="${outputDir}/MergedGazetteer_SMALL.txt" />
        <!-- static resources shipped alongside the data -->
        <fileset file="${resourceDir}/country-names-2013.csv" />
        <fileset file="${resourceDir}/feature-metadata-2013.csv" />
    </zip>
</target>
</project>