Feature/process command #3

Merged · 21 commits · Nov 2, 2017
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.DS_Store
node_modules
**/*/.DS_Store
data
1 change: 1 addition & 0 deletions .nvmrc
@@ -0,0 +1 @@
6.0
36 changes: 36 additions & 0 deletions README.md
@@ -1 +1,37 @@
# openroads-vn-boundaries

A pipeline that takes Vietnam admin unit shapefiles and inserts them as PostGIS tables into the openroads-vn-api database.

### install

#### node packages
`$ yarn install`

#### s3 cli
`$ pip install awscli`

### docker
[mac](https://docs.docker.com/docker-for-mac/install/#where-to-go-next)

[pc](https://docs.docker.com/docker-for-windows/install/)

#### data

Data is downloaded from an S3 bucket. There is no need to add input data manually; it is fetched while the pipeline runs.
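
For reference, the fetch amounts to pulling the shapefile set from s3 into `./data/input`; a minimal sketch of the kind of command `source.sh` runs, with a hypothetical bucket and prefix (not the project's actual bucket):

```sh
# hypothetical bucket and prefix; source.sh holds the real values
aws s3 sync s3://example-bucket/vietnam-boundaries ./data/input
```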

#### database

Add a file `./db/local/index.js` matching the following spec:

```javascript
module.exports = {
connection: {
development: `development.db.url`,
production: `production.db.url`
}
}
```

### run

`yarn start`
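
Presumably `yarn start` kicks off the pipeline script; a sketch of the equivalent direct invocation, assuming the package.json `start` entry (not shown in this diff) maps to it:

```sh
# assumed equivalent of `yarn start`; package.json is not included in this diff
sh admin-tables-pipe.sh
```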
74 changes: 74 additions & 0 deletions admin-tables-pipe.sh
@@ -0,0 +1,74 @@
# Synopsis: links a set of I/O geoprocessing scripts that transform a commune-level shapefile of Vietnam admin areas
########### into a PostGIS table that includes data from commune, district, and province level admin areas


# input directory to which the initial shapefiles are downloaded from s3
INPUT_DIR=./data/input
# output directory that holds the final output of linked processes
OUT_DIR=./data/output
# the base processing directory containing subdirectories that hold I/O data for each process
PROCESSING_BASE_DIR=./data/processing
# a special directory used to handoff data between each process
HNDF_DIR=./data/handoff

# delete handoff or process directories from previous runs that may have errored.
rm -rf ${HNDF_DIR}
rm -rf ${PROCESSING_BASE_DIR}

# make handoff and process directories for current pipeline run
mkdir -p ${PROCESSING_BASE_DIR}
mkdir -p ${HNDF_DIR}
mkdir -p ${INPUT_DIR}

sh source.sh ${INPUT_DIR}

# make directories in ${PROCESSING_BASE_DIR} for each process's I/O; the process scripts live in ./processing
for FILE in ./processing/*
do
# get the base filename from its path to generate the process's ${PROCESS_DIR} in ${PROCESSING_BASE_DIR}
FILEBASE=${FILE##*/}
FILESPLIT=(${FILEBASE//./ })
FILENAME=${FILESPLIT[0]}
PROCESS_DIR=${PROCESSING_BASE_DIR}/${FILENAME}
# make process dir
mkdir ${PROCESS_DIR}
# in ${PROCESS_DIR}, generate the input, tmp, and output ${PROCESS_SUBDIR}s needed to handle process-specific I/O
for SUBDIR in input tmp output
do
PROCESS_SUBDIR=${PROCESS_DIR}/${SUBDIR}
mkdir ${PROCESS_SUBDIR}
# if the current ${PROCESS_SUBDIR} is input, and the process is the first dissolve process, copy the pipeline's only input, the commune shapefile, into it
if [[ $SUBDIR == *"input"* ]]
then
if [[ $PROCESS_SUBDIR == *"dissolve"* ]]
then
cp -R ./data/input/ ${PROCESS_SUBDIR}/
fi
fi
done
# for all processes except the first dissolve process, first copy the data inside ${HNDF_DIR} into the process's input dir, then delete that content from handoff
# removal ensures only the proper files exist there, since some process scripts read everything in input rather than files matching a specific naming pattern
if [[ $PROCESS_DIR != *"dissolve"* ]]
then
cp -R ${HNDF_DIR}/. ${PROCESS_DIR}/input/
rm -f ${HNDF_DIR}/*
fi
# copy input data to the process's tmp dir so the original input can still be inspected if a pipeline process errors
cp -R ${PROCESS_DIR}/input/. ${PROCESS_DIR}/tmp/

# run process with command specific to if it is a shell process or javascript process
echo "--- running ${FILENAME} ---"
if [[ $FILE == *".sh"* ]]
then
${FILE} ${PROCESS_DIR}
else
node ${FILE} ${PROCESS_DIR}
fi
# copy output contents to handoff directory for the next process to grab
cp -R ${PROCESS_DIR}/output/. ${HNDF_DIR}/
done
# clean up temp directories and remove the input data
# rm -rf ${HNDF_DIR}
# rm -rf ${PROCESSING_BASE_DIR}
# rm -R ${INPUT_DIR}

1 change: 0 additions & 1 deletion data/input/vietnam-communes.cpg

This file was deleted.

Binary file removed data/input/vietnam-communes.dbf
Binary file not shown.
1 change: 0 additions & 1 deletion data/input/vietnam-communes.prj

This file was deleted.

1 change: 0 additions & 1 deletion data/input/vietnam-communes.qpj

This file was deleted.

Binary file removed data/input/vietnam-communes.shp
Binary file not shown.
Binary file removed data/input/vietnam-communes.shx
Binary file not shown.
23 changes: 23 additions & 0 deletions db/connection.js
@@ -0,0 +1,23 @@
'use strict';
var assert = require('assert');

// set the db urls based on environment
var DEFAULT_ENVIRONMENT = 'development';
var environment = process.env.ORMA_ENV || DEFAULT_ENVIRONMENT;
var connection = process.env.DATABASE_URL || require('./local').connection[environment];

assert.ok(connection, 'Connection is undefined; check DATABASE_URL or local.js');

// connect knex to the current env's db.
var knex = require('knex')({
client: 'pg',
connection: connection,
debug: false,
pool: {
min: 2,
max: 10
},
acquireConnectionTimeout: 100000
});

module.exports = knex;
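
Both override paths the assertion guards can be exercised from the shell; `some-script.js` and the connection string below are placeholders for any script that requires this module:

```sh
# select the production entry from ./db/local/index.js
ORMA_ENV=production node some-script.js

# or bypass the local config entirely with an explicit connection string (illustrative values)
DATABASE_URL=postgres://user:pass@localhost:5432/openroads node some-script.js
```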
7 changes: 7 additions & 0 deletions db/local/index.js
@@ -0,0 +1,7 @@
module.exports = {
connection: {
'development': '',
'staging': '',
'production': ''
}
}
9 changes: 9 additions & 0 deletions docker/delete-holes/Dockerfile
@@ -0,0 +1,9 @@
FROM nuest/qgis-model:xenial-multimodel
RUN apt-get update
COPY ./delete-holes.py /workspace/delete-holes.py
COPY ./join-geojsons.js /workspace/join-geojsons.js
COPY ./main.sh /workspace/main.sh
COPY ./vietnam-district.geojson /workspace/vietnam-district.geojson
COPY ./vietnam-province.geojson /workspace/vietnam-province.geojson
# on entry into container, run the run.sh script
ENTRYPOINT ["/bin/bash", "/workspace/main.sh"]
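
A build-and-run sketch for this image; the tag and container name are illustrative, not from the repo:

```sh
# build from the directory containing this Dockerfile; the tag name is arbitrary
docker build -t delete-holes docker/delete-holes
# on start, the ENTRYPOINT runs main.sh inside /workspace
docker run --name delete-holes-run delete-holes
# outputs land in the container's /workspace; copy them to the host afterwards
docker cp delete-holes-run:/workspace/vietnam-province-filled-holes.geojson .
```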
23 changes: 23 additions & 0 deletions docker/delete-holes/delete-holes.py
@@ -0,0 +1,23 @@
# Synopsis: fills holes in admin geometries using the qgis:fillholes algorithm bundled with the qgis install
# sys is used primarily for adding qgis utils to path
import sys
import os
# the following qgis modules import and order reference the following
# https://github.com/nuest/docker-qgis-model/blob/master/workspace/example/model.py#L20
from qgis.core import *
import qgis.utils
# to use a processing script, a qgis app needs to be initialized
app = QgsApplication([], True)
QgsApplication.setPrefixPath('/usr', True)
QgsApplication.initQgis()
# append processing plugin to system path
sys.path.append('/usr/share/qgis/python/plugins')
# import, then initialize the processing object
from processing.core.Processing import Processing
Processing.initialize()
import processing
# set path to inputs and outputs
input_communes = os.path.join(os.getcwd(), sys.argv[1])
output_communes = os.path.join(os.getcwd(), sys.argv[2] + '.geojson')
# clean the geometries
processing.runalg('qgis:fillholes', input_communes, 100000, output_communes)
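
Invocation shape, matching the calls in this image's main.sh: the first argument is the input geojson path, the second an output basename to which the script appends `.geojson`; xvfb-run supplies the virtual display qgis expects:

```sh
# writes vietnam-province-filled.geojson; the script appends the extension itself
xvfb-run -a python delete-holes.py vietnam-province-id.geojson vietnam-province-filled
```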
46 changes: 46 additions & 0 deletions docker/delete-holes/join-geojsons.js
@@ -0,0 +1,46 @@
var geojsonStream = require('geojson-stream')
var readFile = require('fs').readFile;
var through2 = require('through2');
var createReadStream = require('fs').createReadStream;
var _ = require('underscore');

/**
 * reads in a feature collection and parses it; the parsed fc is then joined (in a stream) to a different fc
 * @param {string} fc path to the feature collection we are joining (to another fc)
* @param {function} cb a callback!
*/
function joiner (fc, cb) {
readFile(fc, 'utf8', (err, res) => {
if (err) { throw err }
cb(null, JSON.parse(res));
});
}

joiner(process.argv[2], (err, fc) => {
// create obj w/keys === join field val
const joinField = process.argv[4];
// make sure index is a string
if (typeof fc.features[0].properties[joinField] !== 'string') {
fc.features = fc.features.map((f) => {
f.properties[joinField] = f.properties[joinField].toString();
return f;
});
};
const joiningIndex = _.indexBy(fc.features.map(f => f.properties), joinField);
// stream read fc to join, joining each match, then adding to features.
createReadStream(process.argv[3])
.pipe(geojsonStream.parse())
.pipe(through2.obj((feature, enc, callback) => {
// see if joiningIndex includes match and if so pipe it through.
var toJoinVal = feature.properties[joinField];
// only join on matches
if (toJoinVal) {
if (typeof toJoinVal !== 'string') { toJoinVal = toJoinVal.toString(); }
var joinableVal = joiningIndex[toJoinVal];
feature.properties = Object.assign(feature.properties, joinableVal);
}
callback(null, feature)
}))
.pipe(geojsonStream.stringify())
.pipe(process.stdout)
})
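
The argv interface, as exercised in this image's main.sh: `argv[2]` is the attribute-rich geojson that gets indexed, `argv[3]` the geojson streamed through, `argv[4]` the shared join field; the joined collection goes to stdout:

```sh
# join the full attributes back onto the hole-filled provinces, keyed on PROCODE02
node join-geojsons.js vietnam-province.geojson vietnam-province-filled.geojson PROCODE02 \
  > vietnam-province-filled-holes.geojson
```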
17 changes: 17 additions & 0 deletions docker/delete-holes/main.sh
@@ -0,0 +1,17 @@
# run the qgis process in python while keeping qgis 'headless'; put otherwise, access and use the qgis processing modules
# without running the qgis gui
cd /workspace
# install node/npm, link them, then set up a package with the dependencies for join-geojsons.js
apt-get install -qy nodejs
apt-get install -qy npm
ln -s /usr/bin/nodejs /usr/bin/node
npm init -y
# fs is a node core module, so it is not installed here
npm install geojson-stream through2 underscore
# make geojsons with just the id field for cleaning; keeping the full attribute set seems to mangle the vietnamese unicode during hole deletion
ogr2ogr -f 'GeoJSON' vietnam-province-id.geojson vietnam-province.geojson -select PROCODE02
ogr2ogr -f 'GeoJSON' vietnam-district-id.geojson vietnam-district.geojson -select DISTCODE02
xvfb-run -a python delete-holes.py vietnam-province-id.geojson vietnam-province-filled
xvfb-run -a python delete-holes.py vietnam-district-id.geojson vietnam-district-filled
node join-geojsons.js vietnam-province.geojson vietnam-province-filled.geojson PROCODE02 > vietnam-province-filled-holes.geojson
node join-geojsons.js vietnam-district.geojson vietnam-district-filled.geojson DISTCODE02 > vietnam-district-filled-holes.geojson

15 changes: 15 additions & 0 deletions docker/dissolve/Dockerfile
@@ -0,0 +1,15 @@
# Synopsis: Dockerfile that builds an image that includes gdal
# uses https://github.com/geo-data/gdal-docker

FROM geodata/gdal
# make a workspace directory
RUN mkdir -p /workspace/
# copy over python scripts and initial data.
COPY ./vietnam-communes.dbf /workspace/vietnam-communes.dbf
COPY ./vietnam-communes.prj /workspace/vietnam-communes.prj
COPY ./vietnam-communes.qpj /workspace/vietnam-communes.qpj
COPY ./vietnam-communes.shp /workspace/vietnam-communes.shp
COPY ./vietnam-communes.shx /workspace/vietnam-communes.shx
COPY ./main.sh /workspace/main.sh
# on entry into container, run the main.sh script
ENTRYPOINT ["/bin/bash", "/workspace/main.sh"]
24 changes: 24 additions & 0 deletions docker/dissolve/main.sh
@@ -0,0 +1,24 @@
# cd to workspace dir
cd /workspace
INPUT=./vietnam-communes.shp
INPUT_NAME=vietnam-communes

for ADMIN in 'communes;COMCODE02' 'district;DISTCODE02' 'province;PROCODE02'
do
# split ${ADMIN} string on the semi-colon to grab the admin name and field id
ADMIN_ARRAY=(${ADMIN//;/ })
# make the unique output file per the current admin name
OUTPUT=./vietnam-${ADMIN_ARRAY[0]}.geojson
# make ${DISSOLVE_FIELD} per the current admin's dissolve field
DISSOLVE_FIELD=${ADMIN_ARRAY[1]}
# dissolve on the admin field with ogr2ogr and write the output as geojson; also reproject from UTM to wgs84
# this command creates a new geojson whose features are geometries that share the same ${DISSOLVE_FIELD}
# 'ST_Union' merges geometries
# 'GROUP BY' tells gdal which geometries to merge together
# -t_srs is a flag for reprojection
# EPSG:4326 is the WGS84 EPSG code
# http://spatialreference.org/ref/epsg/wgs-84/
ogr2ogr -t_srs EPSG:4326 -f 'GeoJSON' "${OUTPUT}" "${INPUT}" -dialect sqlite -sql $'SELECT ST_Union(geometry), * FROM "'"$INPUT_NAME"$'" GROUP BY '"$DISSOLVE_FIELD"
done

12 changes: 12 additions & 0 deletions docker/update-geojson-spec/Dockerfile
@@ -0,0 +1,12 @@
# Synopsis: Dockerfile that builds a node image with geojson-rewind installed

FROM node:6.11-slim
# install geojson-rewind globally
RUN npm install -g geojson-rewind
RUN mkdir -p /workspace/
COPY ./main.sh /workspace/main.sh
COPY ./vietnam-communes-filled-holes.geojson /workspace/vietnam-communes-filled-holes.geojson
COPY ./vietnam-district-filled-holes.geojson /workspace/vietnam-district-filled-holes.geojson
COPY ./vietnam-province-filled-holes.geojson /workspace/vietnam-province-filled-holes.geojson
ENTRYPOINT ["bin/bash", "/workspace/main.sh"]
17 changes: 17 additions & 0 deletions docker/update-geojson-spec/main.sh
@@ -0,0 +1,17 @@
# cd to workspace dir
cd /workspace

for ADMIN in communes district province
do
# generate unique input and output filenames, as in the previous scripts
INPUT_FILE=./vietnam-${ADMIN}-filled-holes.geojson
OUTPUT_FILE=./vietnam-${ADMIN}-cleaned.geojson
# remove the crs object to match the current GeoJSON spec using sed.
# the command below was found in the following place
# https://stackoverflow.com/questions/38028600/how-to-delete-a-json-object-from-json-file-by-sed-command-in-bash (see the mailer example)
# the `-i .org` allows in-place conversion, so ${INPUT_FILE} effectively has its crs removed.
# sed -i .org -e '/\"crs\"/ d; /^$/d' ${INPUT_FILE}
# geojson-rewind rewinds polygon rings to follow the right-hand rule; the rewound output is saved to ${OUTPUT_FILE}
geojson-rewind ${INPUT_FILE} > ${OUTPUT_FILE}
done
