diff --git a/readme.md b/readme.md
index 1e9b48e..05ed129 100644
--- a/readme.md
+++ b/readme.md
@@ -60,7 +60,7 @@ mlr --csv sort -f service_id,date calendar_dates.txt | sponge calendar_dates.txt
 mlr --csv sort -f trip_id,start_time frequencies.txt | sponge frequencies.txt
 ```
 
-There's also a [`sort.sh` script](sort.sh) included in the npm package, which executes the commands above.
+There's also a [`sort.sh` script](sort.sh) included in the npm package, which executes the commands above. To execute it, you need [Miller](https://miller.readthedocs.io/) (`mlr`), [`sponge`](https://linux.die.net/man/1/sponge) (from moreutils), (GNU) [`sort`](https://linux.die.net/man/1/sort) and [`awk`](https://linux.die.net/man/1/awk) to be installed.
 
 *Note:* For read-only sources (like HTTP requests), sorting the files is not an option. You can solve this by [spawning](https://nodejs.org/docs/latest-v12.x/api/child_process.html#child_process_child_process_spawn_command_args_options) `mlr` and piping data through it.
 
diff --git a/sort.sh b/sort.sh
index 8e79d72..929c4f6 100755
--- a/sort.sh
+++ b/sort.sh
@@ -14,7 +14,34 @@ sort agency.csv -f agency_id
 sort stops.csv -f stop_id
 sort routes.csv -f route_id
 sort trips.csv -f trip_id
-sort stop_times.csv -f trip_id -n stop_sequence
+
+# Miller uses too much memory sorting large files, so stop_times.csv is sorted
+# with GNU sort instead, which doesn't handle `"`-escaped values. First, the
+# stop_sequence and trip_id columns are moved to the front in order to minimize
+# the likelihood of (escaped) values that contain the `,` delimiter confusing
+# sort. TODO: this is quite ugly, find something better, see #34.
+#
+# Prefer gsort if it is on the PATH, otherwise use plain sort. The binary is
+# always run via `command` because the `sort <file> -f <field>` calls in this
+# script suggest that `sort` is also defined as a Miller wrapper here, which a
+# bare `command -v sort`/`$sort` would pick up instead of the real program.
+if command -v gsort >/dev/null 2>&1; then
+  sort=gsort
+else
+  sort=sort
+fi
+# Header row in the reordered column layout; re-added after sorting.
+header="$(head -n 2 stop_times.csv | mlr --csv reorder -f trip_id,stop_sequence | head -n 1)"
+2>&1 echo "mlr --csv reorder -f trip_id,stop_sequence stop_times | $sort -t, -s -k2,2 -k1,1n"
+# NOTE(review): the sort keys assume that column 1 is stop_sequence and column 2
+# is trip_id after `reorder` -- confirm with the Miller version in use.
+# awk prepends the header and passes every row through unchanged (`$0`); `$1`
+# would cut rows off at the first whitespace character.
+mlr --csv --headerless-csv-output reorder -f trip_id,stop_sequence stop_times.csv \
+  | command "$sort" -t, -s -k2,2 -k1,1n \
+  | awk -v "header=$header" 'BEGIN{print header}{print $0}' \
+  | sponge stop_times.csv
+
 sort calendar.csv -f service_id
 sort calendar_dates.csv -f service_id,date
 # todo: sort start_time properly (it may be HH:MM:SS or H:MM:SS)