public-transport · derhuerst · Mar 22, 2021
diff --git a/readme.md b/readme.md
@@ -60,7 +60,7 @@ mlr --csv sort -f service_id,date calendar_dates.txt | sponge calendar_dates.txt
 mlr --csv sort -f trip_id,start_time frequencies.txt | sponge frequencies.txt
 ```
 
-There's also a [`sort.sh` script](sort.sh) included in the npm package, which executes the commands above.
+There's also a [`sort.sh` script](sort.sh) included in the npm package, which executes the commands above. To run it, the following tools must be installed: [Miller](https://miller.readthedocs.io/) (`mlr`), [`sponge`](https://linux.die.net/man/1/sponge) (from moreutils), (GNU) [`sort`](https://linux.die.net/man/1/sort) and [`awk`](https://linux.die.net/man/1/awk).
 
 *Note:* For read-only sources (like HTTP requests), sorting the files is not an option. You can solve this by [spawning](https://nodejs.org/docs/latest-v12.x/api/child_process.html#child_process_child_process_spawn_command_args_options) `mlr` and piping data through it.
 

diff --git a/sort.sh b/sort.sh
@@ -14,7 +14,23 @@ sort agency.csv -f agency_id
 sort stops.csv -f stop_id
 sort routes.csv -f route_id
 sort trips.csv -f trip_id
-sort stop_times.csv -f trip_id -n stop_sequence
+
+# Miller uses too much memory sorting large files, so we use GNU sort instead,
+# which doesn't understand "-quoted CSV values. To make it less likely that a
+# quoted value containing a comma confuses sort, stop_sequence & trip_id are
+# moved to the front first. This is quite ugly; find something better (#34).
+# Prefer gsort if it is on the PATH, else sort; under set -e, abort if neither is.
+# NOTE(review): command -v also reports shell functions; confirm that a function named sort in this script cannot shadow the binary here.
+sort="$(command -v gsort || command -v sort)"
+# The reordered header is derived from the first two lines (header + one row).
+header="$(head -n 2 stop_times.csv | mlr --csv reorder -f trip_id,stop_sequence | head -n 1)"
+echo "mlr --csv reorder -f trip_id,stop_sequence stop_times.csv | $sort -t, -s -k2,2 -k1,1n" >&2 # log to stderr
+# awk re-attaches the header and passes every sorted row through unchanged ($0).
+mlr --csv --headerless-csv-output reorder -f trip_id,stop_sequence stop_times.csv \
+	| "$sort" -t, -s -k2,2 -k1,1n \
+	| awk -v "header=$header" 'BEGIN{print header}{print $0}' \
+	| sponge stop_times.csv
+
 sort calendar.csv -f service_id
 sort calendar_dates.csv -f service_id,date
 # todo: sort start_time properly (it may be HH:MM:SS or H:MM:SS)