-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxmlp
executable file
·979 lines (888 loc) · 28.9 KB
/
xmlp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
#!/bin/bash
# Version number of this script; printed at the end of the Usage text.
gVersion=0.21
# Date stamp of this version; same value as the date column of the newest History entry.
gStamped=230902
#
# xmlp (c) 2020-23 Greg Kerr, Akua Co.
#
# License: Public Domain
#
# History
# 021 GLK 230902 Failed to wget reddit.com - made curl and links higher priority, added WRE for referer (probably not needed)
# 020 GLK 230610 Merge in YML start from Rock
# 019 GLK 230516 Treat tr tags as array if encountered while parsing XML so -he can convert a table snippet
# 018 GLK 230327 EQC for = symbol
# 017 GLK 230320 Some bug in comments with -> in them causes text until over-next --> to be included in comment, fix Usage
# 016 GLK 230313 Support wrapped tags in comments - e.g. <!-- <disabled>Something</disabled> -->
# 015 GLK 230312 CompressTag on DKD starting paths based on DKD* instead of DDD* (Dict/Key items compress based on Dict, not path)
# 014 GLK 230311 -V option for KVO format - bash executable key value output
# 013 GLK 230310 AutoIndex option, feature
# 012 GLK 230309 Better comments handling
# 011 GLK 230306 If no < in a file, and plutil available, auto convert binary plist to plist
# 010 GLK 230113 Document -a option, handle comments (multi-line values still a problem)
# 009 GLK 221224 Forget when I made last change, but root=feed coming out on top of search for jpgs in rss
# 008 GLK 210207 Start to create new compact format with tabs instead of . and array/dict first item on next line
# 007 GLK 201213 Support -e for current behaviour, compress paths with . matching parent component
# 006 GLK 201124 Support array indexing on by default, -a to toggle
# 005 GLK 201121 Support multiple inputs and file or URL as argument
# 004 GLK 201120 Show Usage if no stdin and no args
# 003 GLK 200823 Add -f flag to return after first match; add -k flag to flatten key/value couplets into /key/name/type:value lines
# 002 GLK 200817 Add -n flag to get URL for JPG in RSS feeds from Reddit
# 001 GLK 200711 Fix <br/> issue - I think fixed on another machine already
# Todo
# 230312 DONE Support > in comments
# 230311 DONE Support CSV output
# 230310 DONE Change ':' to something else as : is common in rss feed tags
# Compact format
# Key=value
# Key uncompressed
# / -> child element
# ~ -> child singlet (no further children, but perhaps properties)
# . -> property
# #x -> Array element #x
# $DTD -> dict member
# Key compressed first character
# / Top level path
# : Top level <key>name</key> in a dict (:name)
# ~ Top level singlet
# - backup a level
# XML Format
# Key preceded by number of tabs as prior key paths
# Static
# Static
# User-Agent and Referer values handed to curl/wget by GetURL
WUA='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
WRE='http://www.akua.com/refer-me-to-your-leader/'
# Globals config defaults - settable by command line options (see the getopts loop at the end of the file)
DEBUG=0 # Noisy? (InfoMsg prints when > 0, DbgMsg when > 1)
RAWCONTENT=0 # Process html content tags as XML ... scale 0 to 2
IGNORELAYOUT=1 # Ignore table/div/span constructs in html
IGNOREBLANK=1 # Don't dump empty items
IGNOREXML=0 # Handle HTML?
ONLYFIRST=0 # Return after finding first match
FLATKEY=1 # Turn dict <key>A</key><string>s</string> into path/A/string/s
ARRAYNUMS=1 # Turn # for array items into #idx where idx increments each array item
MATCH='' # Only print matching path
SUFFIX='' # Only print if value has this suffix
NEXTS=0 # Number of MATCH lines to print after SUFFIX is hit (e.g. to include following html line after jpg matches)
NEXTC=0 # This is set to NEXTS when SUFFIX matches
ONLYVALUE=0 # No tag path: prefix
EXPAND=0 # Expanded full paths per line vs . per matching component
OUTXML=0 # Output XML again (after edits presumably)
OUTYML=0 # Output streaming YAML
MULTILINE=0 # Convert new line to $NEWLINE in values (happens for all comments)
NEWLINE=';;' # Convert \n to this if MULTILINE -ne 0
READRAW=0 # Just dump tag reading
AUTOIDX=0 # Detect array of repeating structures and apply index
OUTKVO=0 # Output KVO form (executable by bash)
OUTCSV=0 # Output CSV format (may be crazy)
# Global current tag in process
ROOTELEM='' # ROOTELEM - first element after <?xml, only once per doc - we start parsing here
ENTITY='' # TAG[ prop1=2][ prop3=4][/]
CONTENT='' # Text between <TAG ...> and </TAG>
TAG='' # TAG of entity e.g. <TAG prop=1 prop2=3>, case sensitive by XML specification
CMT='' # Parsed comment tag
CMTN=0 # Comment index (for identification)
PROP='' # properties of TAG
SINGLET=0 # Is the tag a singlet (ends with /> like img, link)
LASTWP='' # Last tag output (to compress path)
LASTDEEP=0 # Last depth output (new compression)
ARRAYIDX=0 # In nth number of array
DEPTH=0 # How deep into the hierarchy are we?
CDEEP=0 # How deep into the content hierarchy are we?
CWP='' # Current working path of parse (should start and end with #), SINGLETs don't affect it, <tag> pushes, </tag> pops
CCP='' # Current content tag path
SKIPLINE=0 # Skipping current line - don't print Fields
AUTOX='' # Auto Index Counters
AUTON=0 # Current Auto Index
PRIOR='' # Prior tag (can change behavior - e.g. 'key' prior makes IGNOREBLANK ignored for next)
PRION='' # Prior popped tag (came out of)
QUOTE='' # Place before and after value outputs
# Document Global
HASARRAY=0 # Encountered array (for CSV HTML table vs CSV xml) - does not work because of pipe
# Delimiters
SUD='/' # Subelement Delimiter
# NOTE(review): the remark below says the delimiter can not be %, yet the default is % - confirm which is intended
DKD='%' # Dict/Key compressed delimiter - can not be %
SND='~' # Singlet delimiter
PRD='.' # Property delimiter
# DDD is built once, here; nothing visible in this file rebuilds it when SUD/DKD/SND/PRD are changed later (-K, -V)
DDD="[$SUD$DKD$SND$PRD]" # For path compression Path delimiters (LASTWP)
IXD='^' # Index delimiter - within a tag
DAD='.' # Path compression, use prior path - last [DDD] component + rest after DAD
EQC='=' # Key=Value separator
# Write one diagnostic line to stderr.
#   Msg - text ...   prints the text as given
#   Msg text ...     prints the text prefixed with a short time stamp,
#                    1000 + (epoch seconds modulo 3600), followed by ': '
Msg() {
  local t=''
  if [ "$1" = '-' ]; then
    shift
  else
    # Compute the stamp in the current shell: inside a ( ... ) subshell the
    # assignment to $t would be thrown away and no prefix would ever appear.
    if [ -n "$EPOCHSECONDS" ]; then
      t="$EPOCHSECONDS"
    else
      t=$(date '+%s')
    fi
    t="$((1000 + (t % 3600))): "
  fi
  # "$t$@" (not "$*"): callers may run with a modified IFS, and echo joins its
  # arguments with single spaces regardless of IFS.
  echo "$t$@" >&2
}
# Informational message: forwards all arguments to Msg when DEBUG is above 0.
# When DEBUG is 0 nothing is printed and the status of the failed test is returned.
InfoMsg() {
  [ $DEBUG -gt 0 ] || return
  Msg "$@"
}
DbgMsg() { local msg; [ $DEBUG -gt 1 ] && msg="$1" && shift && Msg "====> $msg" "$@"; }
# Maintain the auto-index counters kept in AUTOX as ':tag:depth:count:' records.
#   $1 - tag name, $2 - depth
# If AUTOX does not begin with a record for $1, a new record with count 1 is
# appended; otherwise the count of that leading record is incremented and AUTOX
# is rebuilt around it. The resulting count is left in AUTON.
AutoEnter() {
  local rest depth count
  rest="${AUTOX#:$1:}"
  if [ "$rest" != "$AUTOX" ]; then
    # rest now reads 'depth:count:...'
    depth="${rest%%:*}"
    rest="${rest#*:}"
    count="${rest%%:*}"
    rest="${rest#*:}"
    count=$((1 + $count))
    AUTOX="${AUTOX%%:$1:*:*:}:$1:$2:$count:${AUTOX##:$1:*:*:}"
    DbgMsg "AE: $AUTOX [$count]"
    AUTON=$count
  else
    AUTOX="${AUTOX}:$1:$2:1:"
    AUTON=1
  fi
}
# Placeholder counterpart of AutoEnter: currently only logs a debug message.
AutoLeave() { DbgMsg "Nothing yet"; }
# Debug variant of the tag reader (DoRaw calls it in a loop).
# Reads stdin up to the next '<' and splits that chunk into the globals
# ENTITY (text before the first '>'), CONTENT (text after it), TAG, PROP and
# SINGLET, or CMT/PROP for comments, logging each piece through DbgMsg.
# Unlike ReadTag, PROP is not cleared when the tag has no properties.
# Returns 0 when read succeeded, 1 when it failed (end of input or the 33 s timeout).
ReadRaw()
{
# With IFS set to '>' read puts the text before the first '>' in ENTITY and the rest in CONTENT
local IFS=\>
DbgMsg 'RR'
if read -r -d \< -t 33 ENTITY CONTENT; then
# DbgMsg "E: $ENTITY"
# DbgMsg "C: $CONTENT"
CMT="${ENTITY#!--}" # Comment also single tag element - only support comments that have no '>' in them (terminator)
CONTENT="${CONTENT%"${CONTENT##*[![:space:]]}"}" # Remove trailing whitespace characters
# CMT differs from ENTITY only when ENTITY started with '!--', i.e. this chunk opens a comment
if [ "$CMT" != "$ENTITY" ]; then
[ -n "$CONTENT" ] && CMT="${ENTITY}>${CONTENT%>}"
# Get rest of comment up to -->
# Keep appending '<'-delimited chunks until the collected text ends with '--'
while [ "$CMT" = "${CMT%--}" ]; do
DbgMsg "C: $CMT"
if read -r -d \< -t 33 TAG; then
DbgMsg "D: $TAG"
TAG="${TAG%"${TAG##*[![:space:]]}"}" # Remove trailing whitespace characters
[ "$TAG" != "${TAG%-->}" ] && TAG="${TAG%>}" # At end of comment?
CMT="${CMT}<$TAG"
else
# Input ended inside the comment: append '--' so the loop terminates
CMT="${CMT}--"
fi
done
# The comment text (without the trailing '--') is handed over in PROP
PROP="${CMT%--}"
TAG='!--'
else
CONTENT="${CONTENT#"${CONTENT%%[![:space:]]*}"}" # Remove leading whitespace characters
ENTITY="${ENTITY%"${ENTITY##*[![:space:]]}"}" # Remove trailing whitespace characters
SINGLET=1
TAG="${ENTITY%/}" # Single tag element? (ends with / or starts with !)
# No trailing '/' was removed -> not a singlet; otherwise trim what is left
[ "$TAG" = "$ENTITY" ] && SINGLET=0 || TAG="${TAG%"${TAG##*[![:space:]]}"}" # Remove trailing whitespace characters
PROP="${TAG#*[[:space:]]}" # Break out tag and properties (\t \n \v \f \r " "), blank is space,tab
PROP="${PROP%[[:space:]]}" # Dangling space in singlet
TAG="${TAG%%[[:space:]]*}" # Space separator after TAG?
fi
DbgMsg "e: $ENTITY"
DbgMsg "c: $CONTENT"
DbgMsg "t: $TAG [$SINGLET]"
DbgMsg "p: $PROP"
return 0
fi
return 1
}
# Read the next tag from stdin into the globals ENTITY, CONTENT, TAG, PROP,
# SINGLET (and CMT for comments).
# Input is consumed up to the next '<'; the text before the first '>' is the
# entity (tag name plus properties), the text after it is the content.
# example BODY a=b c="Some text">Test<NextTag> -> ENTITY = BODY ... text", CONTENT = Test, pos at NextTag>; TAG=BODY
# Comments are returned as TAG='!--' with the comment text in PROP.
# Returns 0 when a tag was read, 1 at end of input or on timeout.
ReadTag() {
  # Split words at >, split lines at <
  local IFS=\>
  local rc
  # -d delimiter, -r backslash not a shell char, -t timeout in seconds
  read -r -d \< -t 33 ENTITY CONTENT
  rc=$?
  if [ $rc -ne 0 ]; then
    # read also fails when input ends without a further '<', although it has
    # already stored the final tag; accept that tag instead of dropping it.
    # A status above 128 is a timeout (possibly a half-read tag): give up.
    if [ $rc -gt 128 ] || [ -z "$ENTITY" ]; then
      return 1
    fi
  fi
  CMT="${ENTITY#!--}" # Comment also single tag element - only support comments that have no '>' in them (terminator)
  CONTENT="${CONTENT%"${CONTENT##*[![:space:]]}"}" # Remove trailing whitespace characters
  if [ "$CMT" != "$ENTITY" ]; then
    # ENTITY started with '!--': collect the whole comment
    [ -n "$CONTENT" ] && CMT="${ENTITY}>${CONTENT%>}"
    # Get rest of comment up to -->
    while [ "$CMT" = "${CMT%--}" ]; do
      if read -r -d \< -t 33 TAG; then
        TAG="${TAG%"${TAG##*[![:space:]]}"}" # Remove trailing whitespace characters
        [ "$TAG" != "${TAG%-->}" ] && TAG="${TAG%>}" # At end of comment?
        CMT="${CMT}<$TAG"
      else
        # Input ended inside the comment: force the loop to stop
        CMT="${CMT}--"
      fi
    done
    PROP="${CMT%--}"
    TAG='!--'
  else
    CONTENT="${CONTENT#"${CONTENT%%[![:space:]]*}"}" # Remove leading whitespace characters
    ENTITY="${ENTITY%"${ENTITY##*[![:space:]]}"}" # Remove trailing whitespace characters
    SINGLET=1
    TAG="${ENTITY%/}" # Single tag element? (ends with / or starts with !)
    [ "$TAG" = "$ENTITY" ] && SINGLET=0 || TAG="${TAG%"${TAG##*[![:space:]]}"}" # Remove trailing whitespace characters
    PROP="${TAG#*[[:space:]]}" # Break out tag and properties (\t \n \v \f \r " "), blank is space,tab
    PROP="${PROP%[[:space:]]}" # Dangling space in singlet
    TAG="${TAG%%[[:space:]]*}" # Space separator after TAG?
    # Without any whitespace both expansions return the whole tag: no properties
    [ "$PROP" = "$TAG" ] && PROP=''
  fi
  # TAG (Element Name) by spec are case sensitive, but attributes not
  DbgMsg "RTAG: $TAG [$PROP]$CONTENT[$SINGLET|$DEPTH|$CWP]"
  return 0
}
# Read the next tag with ReadTag and update the XML parse state:
#   CWP   - current working path; opening tags append a component, closing tags remove one
#   DEPTH - incremented on opening tags, decremented on closing tags
#   PRION - set to the closing tag that was just popped
# With FLATKEY on, plist style <key> elements are folded into the path and the
# entry below an array/tr component is replaced by '#<index>'.
# After a closing tag TAG is cleared, so callers only see opening tags, singlets
# and comments. Returns 0 when a tag was read, 1 when ReadTag failed.
ReadXmlTag()
{
local aIdx match
# Last component of the current path (text after the final '/')
local pTag="${CWP##*/}"
# pTag with everything up to the first DKD removed; equal to pTag when it holds no DKD
local pDic="${pTag#*${DKD}}"
# Delimiter used when TAG is appended to CWP further down
local del="$SUD"
if ReadTag; then
if [ $SINGLET -eq 0 -a -n "$TAG" ]; then
# Convert dict & array to macro, join dict/key into ${DKD}keyname construct
if [ $FLATKEY -ne 0 ]; then case "$TAG" in
'key')
# '/' and ':' in the key name are replaced by '_' and '+'
if [ $BASH_VERSINFO -gt 3 ]; then
CONTENT="${CONTENT//\//_}"
CONTENT="${CONTENT//:/+}"
else
CONTENT=$(echo "$CONTENT" | tr '/:' '_+')
fi
if [ "$pTag" = 'dict' ]; then # First level of dictionary
# Drop the trailing 'dict' component; the key is attached with DKD instead
CWP="${CWP%[${SUD}]*}"
TAG="$CONTENT/key" # Dict name becomes Tag
CONTENT=''
del="$DKD"
elif [ "$pDic" != "$pTag" ]; then
# The last component already holds a DKD key: replace it with this key
CWP="${CWP%[${DKD}]*}"
TAG="$CONTENT/key"
CONTENT=''
del="$DKD"
else
TAG="$CONTENT/key"
CONTENT=''
fi
;;
'/array'|'/tr')
pDic="${pTag%#*}"
# Revert array number in path to array
[ "$pDic" != "$pTag" ] && CWP="${CWP%${SUD}#*}${SUD}${TAG#/}"
HASARRAY=1
;;
'/dict')
# Revert path from $DKD to being a dict
[ "$pDic" != "$pTag" ] && CWP="${CWP%[${DKD}]*}${SUD}dict"
;;
*)
case "$pDic" in
'array'|'tr')
# First element below an array/tr: the last path component becomes '#1' (or '#' when ARRAYNUMS is 0)
AUTOIDX=0 # Have arrays, turn this off
ARRAYIDX=1
if [ $ARRAYNUMS -eq 0 ]; then
aIdx=''
# PrintTag "${CWP}" '-'
else
aIdx="$ARRAYIDX"
fi
CWP="${CWP%/*}${SUD}#$aIdx"
;;
'#'[0-9]*)
# Later element: the last component is already '#n', move on to the next index
ARRAYIDX=$((1 + $ARRAYIDX))
if [ $ARRAYNUMS -eq 0 ]; then
aIdx=''
# PrintTag "${CWP}" '-'
else
aIdx="$ARRAYIDX"
fi
CWP="${CWP%${SUD}*}${SUD}#$aIdx"
;;
esac
;;
esac; fi
# Singlets are processed as a whole - so we are either going deep or popping up
if [ "$TAG" = '!--' ]; then # Comment? Don't change depth or CWP (Path)
DbgMsg "Comment: $PROP"
elif [ "${TAG#/}" = "$TAG" ]; then # Going deeper (not ending a block)
# Msg "-------------> Push $TAG onto ($CWP - $pDic - $PROP - $CONTENT)"
CWP="${CWP}${del}$TAG"
DEPTH=$((1 + $DEPTH))
# Msg "*** $PRION : $TAG ($PRIOR)"
# Auto index: this tag re-opens the element that was just closed, so append IXD and its running count
[ $AUTOIDX -ne 0 -a "$PRION" = "/$TAG" ] && AutoEnter "$TAG" $DEPTH && CWP="${CWP}${IXD}$AUTON"
else
# Closing tag: compare with the last path component (without any IXD index suffix)
match="${CWP##*${SUD}}"
match="${match%${IXD}*}"
[ -z "$match" ] && match="$ROOTELEM"
[ "${TAG#/}" != "$match" ] && Msg "*** XML Tag mismatch at $CWP [$match] (${TAG#/} expected)"
# Msg "<------------ Pop $TAG from ($CWP - $CONTENT)"
DEPTH=$(($DEPTH - 1))
# If not compressing, must indicate end of record
[ $EXPAND -ne 0 -a $FLATKEY -eq 0 ] && PrintTag "$TAG" '.'
PRION="$TAG"
[ $DEPTH -eq 0 ] && CWP='' || CWP="${CWP%${SUD}*}"
TAG='' # Coming up
fi
fi
return 0
fi
return 1
}
# Read the next tag of an HTML fragment with ReadTag and track the content
# path: CCP gains a component for every opening tag and loses one for every
# closing tag, CDEEP follows the nesting depth.
# Layout tags without properties are blanked when IGNORELAYOUT is on, and TAG
# is cleared after a closing tag.
# Returns 0 when a tag was read, 1 when ReadTag failed.
ReadHtmlTag() {
  ReadTag || return 1

  # Pure layout elements carrying no properties are of no interest
  case "$TAG" in
    tr|td|table|div|span|br|p|/tr|/td|/table|/div|/span|/p)
      if [ $IGNORELAYOUT -ne 0 ] && [ -z "$PROP" ]; then
        TAG=''
      fi
      ;;
  esac

  # Singlets and blanked tags leave the path untouched
  if [ $SINGLET -ne 0 ] || [ -z "$TAG" ]; then
    return 0
  fi

  case "$TAG" in
    /*)
      # Closing tag: should match the innermost open element
      if [ "${TAG#/}" != "${CCP##*${SUD}}" ]; then
        Msg "*** HTML Tag mismatch at $CCP (${TAG#/} expected)"
      fi
      CDEEP=$(($CDEEP - 1))
      CCP="${CCP%${SUD}*}"
      TAG='' # Coming up
      ;;
    *)
      # Opening tag: going deeper
      CCP="${CCP}${SUD}$TAG"
      CDEEP=$((1 + $CDEEP))
      ;;
  esac
  return 0
}
# Print one line of output, honouring MATCH and IGNOREBLANK.
#   $1 - tag path, $2 - value
# Sets SKIPLINE to 1 and prints nothing when MATCH is set and $1 does not start
# with it. For a singlet that directly follows a <key>, the value is replaced by
# the singlet delimiter plus tag name. Empty values are only printed when
# IGNOREBLANK is 0. Always returns 0.
PrintLine() {
  local tag="$1"
  local val="$2"
  DbgMsg "LINE: $tag [$val] [$MATCH] [$SUFFIX] $IGNOREBLANK [$PRIOR]"
  # Skip line if not a match
  if [ -n "$MATCH" ] && [ "${tag#$MATCH}" = "$tag" ]; then
    SKIPLINE=1
    return 0
  fi
  SKIPLINE=0
  if [ $SINGLET -ne 0 ] && [ "$PRIOR" = 'key' ]; then
    val="$SND$TAG"
  fi
  # Separate tests instead of one '[ ... -o ... ]': a value that looks like a
  # test operator (e.g. '=') made the combined form fail with a syntax error,
  # so such values were never printed.
  if [ -n "$val" ] || [ $IGNOREBLANK -eq 0 ]; then
    PrintField "$tag" "$val"
  fi
  return 0
}
# Print every property of a tag as its own line.
#   $1     - tag path
#   $2 ... - one 'name=value' word per property
# For singlets the singlet delimiter and tag name are appended to the path.
# Each property goes to PrintLine as '<path><PRD><name>' with one layer of
# surrounding double and single quotes removed from the value.
PrintProps() {
  local path="$1"
  local name value
  shift
  DbgMsg "PROP: [$path] [$@] [$#]"
  if [ $SINGLET -ne 0 ]; then
    path="${path}$SND${TAG}"
    DbgMsg "SINGLET: $path [$TAG]"
  fi
  while [ $# -gt 0 ]; do
    name="${1%%=*}" # Split property from value
    value="${1#*=}"
    value="${value#\"}" # Chop off quoting
    value="${value%\"}"
    value="${value#\'}"
    value="${value%\'}"
    PrintLine "$path$PRD$name" "$value"
    shift
  done
}
# Filter: copy stdin to stdout as one line, joining the input lines with $NEWLINE.
# Leading and trailing whitespace of each input line is dropped (default IFS).
MultiLine() {
  local l line=''
  # -r keeps backslashes in the text intact; the '|| [ -n "$l" ]' part also
  # picks up a last line that is not terminated by a newline.
  while read -r l || [ -n "$l" ]; do
    line="${line}${NEWLINE}$l"
  done
  # Quote NEWLINE so it is removed as literal text, not used as a glob pattern
  printf '%s\n' "${line#"$NEWLINE"}"
}
# Helper for PrintTag: split a property string into words, stored in the array PROPV.
# Words are separated by whitespace; text inside double or single quotes stays
# in one word and the quotes themselves are removed. Nothing is evaluated, so
# characters such as $, ` or & in the text have no effect.
_SplitProps() {
  local rest="$1"
  local word chunk quote
  local dq='"' sq="'"
  PROPV=()
  while :; do
    rest="${rest#"${rest%%[![:space:]]*}"}" # Skip leading whitespace
    [ -n "$rest" ] || break
    word=''
    while [ -n "$rest" ]; do
      case "$rest" in
        [[:space:]]*)
          break # Unquoted whitespace ends the word
          ;;
        "$dq"*)
          quote="$dq"
          ;;
        "$sq"*)
          quote="$sq"
          ;;
        *)
          # Plain run of characters up to the next whitespace or quote
          chunk="${rest%%[[:space:]$dq$sq]*}"
          word="$word$chunk"
          rest="${rest:${#chunk}}"
          continue
          ;;
      esac
      # Quoted section: take everything up to the closing quote (or the rest
      # of the text when the quote is never closed)
      rest="${rest#?}"
      case "$rest" in
        *"$quote"*)
          word="$word${rest%%"$quote"*}"
          rest="${rest#*"$quote"}"
          ;;
        *)
          word="$word$rest"
          rest=''
          ;;
      esac
    done
    PROPV[${#PROPV[@]}]="$word"
  done
}

# Print a tag: first its properties (through PrintProps), then its content
# (through PrintLine).
#   $1 - tag path to print under; the current path CWP is used when empty
#   $2 - optional text that replaces PROP
# Comments (TAG is '!--') are printed as one line with the comment text run
# through MultiLine, and CMTN is incremented.
PrintTag() {
  local tag
  local -a PROPV
  tag="$1"
  [ -n "$2" ] && PROP="$2"
  shift
  [ -z "$tag" ] && tag="$CWP" && DbgMsg "TCWP: $tag [$PROP]"
  if [ $OUTYML -eq 0 ]; then
    DbgMsg "PTAG: [$tag] [$PROP] $IGNOREBLANK [$PRIOR]"
    if [ "$TAG" = '!--' ]; then
      PROP=$(echo "$PROP" | MultiLine)
      PrintLine '#' "$PROP"
      CMTN=$((1 + $CMTN))
    else
      [ $MULTILINE -ne 0 ] && PROP=$(echo "$PROP" | MultiLine)
      if [ "$PROP" != "${PROP#*\&}" ]; then
        # PROP can have '&': decode entities in properties and path
        HtmlUnraw "$PROP"; PROP="$OUT"
        HtmlUnraw "$tag"; tag="$OUT"
      fi
      # Properties must reach PrintProps as separate words with quoted values
      # kept together. This used to be done with eval, which executed any shell
      # syntax found in the document's attributes.
      _SplitProps "$PROP"
      PrintProps "$tag" "${PROPV[@]}"
      [ $MULTILINE -ne 0 ] && CONTENT=$(echo "$CONTENT" | MultiLine)
      PrintLine "$tag" "$CONTENT"
    fi
  elif [ $OUTXML -eq 0 ]; then
    DbgMsg "PTAG: [$tag] [$PROP] $IGNOREBLANK [$PRIOR]"
    if [ "$TAG" = '!--' ]; then
      PROP=$(echo "$PROP" | MultiLine)
      PrintLine "__$CMTN" "$PROP"
      CMTN=$((1 + $CMTN))
    else
      [ $MULTILINE -ne 0 ] && PROP=$(echo "$PROP" | MultiLine)
      _SplitProps "$PROP"
      PrintProps "$tag" "${PROPV[@]}"
      [ $MULTILINE -ne 0 ] && CONTENT=$(echo "$CONTENT" | MultiLine)
      PrintLine "$tag" "$CONTENT"
    fi
  else
    # Untested and makes no sense
    if [ $SINGLET -ne 0 ]; then
      PrintLine "<$TAG/>" || [ -z "$PROP" ] && PrintLine "${CWP}:" "$TAG"
    else
      PrintLine "<$TAG $PROP>$CONTENT</$TAG>" || PrintLine "$tag" "$CONTENT"
    fi
  fi
}
# Compress tag path with prior output path if applicable
#   $1 - full tag path; the result is returned in OUT
# When $1 starts with LASTWP (the prefix remembered from the previous call),
# that prefix is replaced by DAD. LASTWP is then updated for the next call:
#   If array, drop left of array item path (${SUD}# delimiter)
#   If key/dict, drop left of key/dict ($DKD delimiter)
#   Else drop left of any of 4 path delimiters ($DDD) - array, keydict, item, singlet
# Always returns 0.
CompressTag() {
  local a
  local t="$1"
  DbgMsg "*** COMP: [$LASTWP][$t][$CWP]"
  # Try lopping off LastWP; quoted so it is matched as literal text
  a="${t#"$LASTWP"}"
  if [ "${t#*${SUD}#}" != "$t" ]; then
    LASTWP="${t%${SUD}#*}"
  elif [ "${t#*"${DKD}"}" != "$t" ]; then
    # Test for the configured dict/key delimiter rather than a fixed '%',
    # so this branch keeps working when -K selects another character
    LASTWP="${t%"${DKD}"*}"
  else
    LASTWP="${t%${DDD}*}"
  fi
  [ "$a" != "$t" ] && t="${DAD}$a"
  OUT="$t"
  return 0
}
# Called from PrintLine: write one output line for a tag path and its value.
#   $1 - tag path, $2 - value
# The value is normalised first: leading/trailing whitespace is removed and
# every inner run of whitespace becomes a single space.
# Nothing is printed while SKIPLINE is set, or when SUFFIX is set and neither
# the value ends with it nor a follow-up line is still due (NEXTC).
# Normal output is '<path><EQC><QUOTE>value<QUOTE>' (value only with ONLYVALUE);
# with OUTXML the value is written as a DEPTH-tab indented element.
# Exits the shell after printing when ONLYFIRST is 1 and MATCH or SUFFIX is set.
PrintField() {
  local f p l tag n
  tag="$1"
  f="$2"
  l=''
  f="${f%"${f##*[![:space:]]}"}" # Remove trailing whitespace characters
  while [ -n "$f" ]; do
    f="${f#"${f%%[![:space:]]*}"}" # Remove leading whitespace characters
    p="${f%%[[:space:]]*}" # Get first "field"
    l="$l $p"
    [ "$p" = "$f" ] && f='' || f="${f#*[[:space:]]}"
  done
  l="${l# }"
  DbgMsg "FIELD: $tag [$2] [$p] [$l] [$EXPAND]"
  if [ $SKIPLINE -eq 0 ]; then
    if [ -n "$SUFFIX" ]; then
      f="${l%$SUFFIX}"
      if [ "$f" = "$l" ]; then # Suffix mismatch
        [ $NEXTC -le 0 ] && return 0
        NEXTC=$(($NEXTC - 1))
      else
        NEXTC="$NEXTS" # Suffix match, this many matching tags after this one will be included
      fi
    fi
    # Compress some tag path items - e.g. dict/key to $DKD
    if [ $OUTXML -eq 0 ]; then
      if [ $EXPAND -eq 0 -a "$TAG" != '!--' ]; then
        CompressTag "$tag"
        tag="$OUT"
      fi
      if [ $OUTKVO -ne 0 ]; then
        # ':' is replaced by '_' in the path for KVO output
        if [ $BASH_VERSINFO -gt 3 ]; then
          tag="${tag//:/_}"
        else
          tag=$(echo "$tag" | sed -e 's/:/_/g')
        fi
      fi
      if [ -n "$QUOTE" ]; then
        # Escape quote characters that occur inside the value
        if [ $BASH_VERSINFO -gt 3 ]; then
          l="${l//$QUOTE/$QUOTE\\\\$QUOTE$QUOTE}"
        else
          l=$(echo "$l" | sed -e "s/$QUOTE/$QUOTE\\\\$QUOTE$QUOTE/g")
        fi
      fi
      [ $ONLYVALUE -eq 0 ] && echo "${tag}${EQC}${QUOTE}${l}${QUOTE}" || echo "${QUOTE}$l${QUOTE}"
    else
      # Indent with one tab per level. The counter must not be $l: that
      # variable holds the value printed on the next line.
      for ((n=$DEPTH; n>0; n--)); do printf "\t"; done
      echo "<$tag>${l}</$tag>"
    fi
    [ $ONLYFIRST -eq 1 -a -n "$MATCH$SUFFIX" ] && exit 0 || ONLYFIRST=$(($ONLYFIRST - 1))
  fi
}
# Main XML loop: reset path and depth, then read tags with ReadXmlTag until it
# fails, printing every tag that is still set afterwards.
#   comments        -> PrintTag '!--' with the comment text (when not empty)
#   <content> tags  -> DoContentTag, then one more tag is read to swallow </content>
#   everything else -> PrintTag
# After each handled tag other than a comment, PRIOR is set to the last
# '/'-separated part of TAG.
FlattenXml() {
  CWP='' # Start at root of whomever called me
  DEPTH=0
  while ReadXmlTag; do
    # Closing tags come back with TAG cleared
    if [ -z "$TAG" ]; then
      continue
    fi
    if [ "$TAG" = '!--' ]; then
      InfoMsg "CMT: $PROP"
      [ -n "$PROP" ] && PrintTag '!--' "$PROP"
    elif [ "$TAG" = 'content' ]; then
      DbgMsg "CONTENT ($RAWCONTENT): $CONTENT"
      DoContentTag "$TAG" "$CONTENT"
      ReadXmlTag || return 0 # kill /content
    else
      DbgMsg "READ: $TAG [$PROP] [$CONTENT] [$CWP] [$OUTXML]"
      PrintTag
      DbgMsg "RADE: $TAG [$PROP] [$CONTENT] [$CWP]"
    fi
    [ "$TAG" != '!--' ] && PRIOR="${TAG##*/}"
  done
}
# Flatten an HTML fragment read from stdin.
# Tags are read with ReadHtmlTag; each one is printed with PrintTag under the
# path that was current on entry plus the content path CCP. Comments are
# printed when not empty. CWP is restored to its entry value at the end.
FlattenHtml() {
  local cwp="$CWP"
  CCP=''
  CDEEP=0
  while ReadHtmlTag; do
    if [ -n "$TAG" ]; then
      case "$TAG" in
        '!--')
          InfoMsg "CMT: $PROP"
          [ -n "$PROP" ] && PrintTag '!--' "$PROP"
          ;;
        *)
          CWP="${cwp}${CCP}"
          PrintTag
          ;;
      esac
    fi
  done
  # Restore the caller's path from the saved copy ($cwp; this line used to
  # read an unset variable and therefore cleared CWP)
  CWP="$cwp"
}
# Decode basic HTML character entities in $1; the result is returned in OUT.
# Handled: &lt; &gt; &quot; &nbsp; &#160; &apos; &#39; and, last of all, &amp;
# (decoding &amp; last keeps text such as '&amp;lt;' from being decoded twice).
# bash 4 and later use parameter expansion, older versions fall back to sed.
HtmlUnraw() {
  local amp='&' dq='"' sq="'"
  if [ $BASH_VERSINFO -gt 3 ]; then
    OUT="${1//&lt;/<}"
    OUT="${OUT//&gt;/>}"
    OUT=${OUT//&quot;/$dq}
    OUT="${OUT//&nbsp;/ }"
    OUT="${OUT//&#160;/ }"
    OUT=${OUT//&apos;/$sq}
    OUT=${OUT//&#39;/$sq}
    # The replacement is quoted so that '&' is inserted literally even where
    # bash treats an unquoted '&' in a replacement as "the matched text"
    OUT=${OUT//&amp;/"$amp"}
  else
    OUT=$(printf '%s\n' "$1" | sed -e 's/&lt;/</g' -e 's/&gt;/>/g' -e 's/&quot;/"/g' -e 's/&nbsp;/ /g' -e 's/&#160;/ /g' -e "s/&apos;/'/g" -e "s/&#39;/'/g" -e 's/&amp;/\&/g')
  fi
}
# Handle the text of a <content> element according to RAWCONTENT.
#   $1 - tag name, $2 - content text
#   RAWCONTENT 0: decode entities and parse the result as HTML (FlattenHtml)
#   RAWCONTENT 1: decode entities and print the result as one value
#   otherwise   : print the text untouched
DoContentTag() {
  local decoded
  # Basic HTML encoded special character conversion ... and send it on down the processor
  if [ $RAWCONTENT -lt 2 ]; then
    HtmlUnraw "$2"
    decoded="$OUT"
    InfoMsg "HTML: $decoded"
  fi
  if [ "$RAWCONTENT" = 0 ]; then
    # Variables changed will be in the fork, not here (ideally we'd like LASTWP back)
    echo "$decoded" | FlattenHtml
  elif [ "$RAWCONTENT" = 1 ]; then
    PrintLine "$1" "$decoded"
  else
    PrintLine "$1" "$2"
  fi
}
# Tag reader test: print 'T: <tag>' for every tag ReadRaw delivers, then log
# a debug message. The function's status is that of the final DbgMsg call.
DoRaw() {
  while :; do
    ReadRaw || break
    printf 'T: %s\n' "$TAG"
  done
  DbgMsg 'Exit Raw'
}
# Process one XML document from stdin.
#   1. With READRAW set, run the tag reader test (DoRaw) instead.
#   2. Skip input until the '?xml' header tag and print it as 'xml'.
#   3. Read tags until the root element is found: a DOCTYPE is printed as
#      dtd.* lines, comments are printed, and the first other tag becomes ROOTELEM.
#   4. Hand the rest of the input to FlattenXml.
# Exits the shell with status 0 when input ends before steps 2 or 3 complete.
DoXML()
{
local c p d r
# Just debug the reader
[ $READRAW -ne 0 ] && DoRaw && return 0
# Scan to start of an XML section (mostly the first line)
while [ "$TAG" != '?xml' ]; do
ReadTag || exit 0 # XML header
done
# Drop the first character of TAG and the last character of PROP (the '?' of <?xml ... ?>)
TAG="${TAG#?}"
PROP="${PROP%?}"
PrintTag 'xml'
# Next tag is DTD by XML definition; RSS Feeds don't have !doctype - they go right into feed
# Scan until root
while [ -z "$ROOTELEM" ]; do
ReadXmlTag || exit 0
# <?xml version="1.0" encoding="UTF-8"?>
# <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
# <plist version="1.0">
case "$TAG" in
![Dd][Oo][Cc][Tt][Yy][Pp][Ee])
# c: names still to be assigned, one per DOCTYPE field; d: delimiter that ends the current field
c='class realm flavor url prop'
d=' '
while [ -n "$PROP" ]; do
# Current field = text up to the next delimiter
p="${PROP%%${d}*}"
p="${p%${d}}"
# Msg "### dtd: $p [$PROP][$d]"
PrintLine "dtd.${c%% *}" "$p"
# r remembers the first field (the class) for the root element check below
[ -z "$r" ] && r="$p"
# Remainder after the field; a leading double quote switches the delimiter to '"'
p="${PROP#*${d}}"
p="${p# }"
[ "${p:0:1}" = \" ] && d=\" || d=' '
# Nothing was removed -> this was the last field
[ "$p" = "$PROP" ] && PROP='' || PROP="$p"
PROP="${PROP#${d}}"
# Move on to the next field name
c="${c#* }"
done
;;
'!--') PrintTag '!--' "$PROP" ;;
*)
# The first non xml/doctype/comment element - we could verify it matches 'class' from DTD
DEPTH=1
ROOTELEM="$TAG"
[ -n "$r" -a "$r" != "$TAG" ] && Msg "*** Top level does not match DTD class ($r != $TAG)"
PrintLine 'root' "$ROOTELEM"
PrintTag 'root'
;;
esac
done
# Now inside XML
InfoMsg "ENTER: XML $ROOTELEM ($CWP)"
FlattenXml # Start at the top tag
InfoMsg "LEAVE: XML $ROOTELEM"
}
# Parse stdin with the parser selected by IGNOREXML - DoXML (expects an ?xml
# header) when it is 0, FlattenXml otherwise - and send the result through
# OutCSV when OUTCSV is set.
DoTheRightThing() {
  # Plain if/else on purpose: with 'test && DoXML || FlattenXml', FlattenXml
  # was also started after DoXML whenever DoXML returned a non-zero status.
  if [ $OUTCSV -ne 0 ]; then
    if [ $IGNOREXML -eq 0 ]; then
      DoXML
    else
      FlattenXml
    fi | OutCSV
  else
    if [ $IGNOREXML -eq 0 ]; then
      DoXML
    else
      FlattenXml
    fi
  fi
}
# Prepare one value for CSV output; the result is returned in OUT.
# A value containing a comma, the NEWLINE marker or a double quote is wrapped
# in double quotes after: doubling embedded quotes, replacing commas by the
# placeholder 'v:^:v' (undone by FieldCSX) and turning NEWLINE markers into
# line breaks. Any other value is returned unchanged.
FieldCSV() {
  local field="$1"
  local nl='
'
  OUT="$field"
  # Nothing special in the value: done
  if [ "$field" = "${field#*,}" ] && [ "$field" = "${field#*${NEWLINE}}" ] && [ "$field" = "${field#*\"}" ]; then
    return 0
  fi
  if [ $BASH_VERSINFO -le 3 ]; then
    field=$(echo "$field" | sed -e "s/\"/\"\"/g" -e "s/$NEWLINE/\n/g" -e 's/,/v:^:v/g')
  else
    field="${field//\"/\"\"}"
    field="${field//,/v:^:v}"
    field="${field//$NEWLINE/$nl}"
  fi
  OUT="\"${field}\""
}
# Print a finished CSV line: every 'v:^:v' placeholder (put in by FieldCSV) is
# turned back into a comma, then the line is written to stdout.
FieldCSX() {
  local row="$1"
  case "$row" in
    *'v:^:v'*)
      if [ $BASH_VERSINFO -le 3 ]; then
        row=$(echo "$row" | sed -e 's/v:^:v/,/g')
      else
        row="${row//v:^:v/,}"
      fi
      ;;
  esac
  echo "$row"
}
# Filter: turn flattened 'path=value' lines read from stdin into CSV rows.
# Lines are ignored until the first one that starts with the subelement
# delimiter SUD. The last path component of each line is the column name:
#   th     - the value becomes a column title
#   td     - the value is appended to the current row; array item 1 starts a new row
#   other  - the name is looked up in (or added to) the title list and the value
#            is stored in that column; a column that is already filled ends the
#            row: the title line is printed once, the row is printed, and the
#            value starts the next row
# Rows are printed with FieldCSX; the row still being built is printed at the end.
OutCSV() {
  local line t first L cnt title s p n f P v
  first=''
  cnt=0
  title=''
  L=''
  P=0 # Processing?
  # -r: keep backslashes that occur in values
  while read -r line; do
    if [ $P -ne 0 ] || [ "${line#${SUD}}" != "$line" ]; then
      P=1 # Processing after first item
      t="${line%%=*}" # First field name
      # Anything nested inside a table cell counts as the cell itself
      [ "$t" != "${t%/td/*}" ] && t="${t%/td/*}/td"
      FieldCSV "${t##*${DDD}}" # Base name
      t="$OUT"
      FieldCSV "${line#*=}" # Value part
      v="$OUT"
      case "$t" in
        'th')
          HtmlUnraw "$v"
          [ -z "$title" ] && title="$OUT" || title="${title},$OUT"
          cnt=$((1 + $cnt))
          ;;
        'td') # Could be td/div/div/span=
          [ -z "$first" ] && first="${title##,*}" && FieldCSX "$title"
          # Get array item number
          p="${line%%=*}"
          p="${p%/td*}"
          p="${p##*#}"
          [ $p -eq 1 -a -n "$L" ] && FieldCSX "$L" && L=''
          HtmlUnraw "$v"
          [ -z "$L" ] && L="$OUT" || L="${L},$OUT"
          ;;
        *)
          # Find field in title
          s="$title"
          p=0
          n=0
          while [ -n "$s" -a $p -eq 0 ]; do
            n=$((1 + $n))
            [ "${s%%,*}" = "$t" ] && p=$n
            [ "${s#*,}" = "$s" ] && s='' || s="${s#*,}"
          done
          if [ $p -eq 0 ]; then
            # New field
            p=$n
            [ $p -gt $cnt ] && cnt=$p
            [ -z "$title" ] && title="$t" || title="${title},$t"
            [ -z "$L" ] && L="$v" || L="${L},$v"
          else
            # Add field - if it collides, print line and start new
            # Copy the first p fields of the row into L; s keeps what is left
            n=$p
            s="$L"
            L=''
            while [ -n "$s" -a $n -gt 0 ]; do
              f="${s%%,*}"
              [ -z "$L" ] && L="$f" || L="${L},$f"
              [ "${s#*,}" = "$s" ] && s='' || s="${s#*,}"
              n=$(($n - 1))
            done
            if [ $n -gt 0 ]; then
              # We have to create fields
              while [ $n -gt 1 ]; do
                L="${L},"
                n=$(($n - 1))
              done
              L="${L},$v"
            elif [ -z "$f" ]; then
              # Empty field, replace it
              L="${L},$v,$s"
            else
              # Collision, new row
              [ -z "$first" ] && first="${title##,*}" && FieldCSX "$title"
              [ -n "$s" ] && L="${L},${s}"
              FieldCSX "$L"
              L=''
              while [ $n -gt 0 ]; do
                L="${L},"
                n=$(($n - 1))
              done
              [ -n "$L" ] && L="${L},$v" || L="$v"
            fi
          fi
          ;;
      esac
    fi
  done
  [ -n "$L" ] && FieldCSX "$L"
}
# Write the document named by $1 to stdout.
#   URL (contains '://') - fetched with the first tool found of curl, links, wget;
#                          curl and wget send the User-Agent $WUA and Referer $WRE
#   existing file        - copied with cat; when plutil is available and the file
#                          holds no '?xml' text but plutil accepts it, the file is
#                          converted to xml1 format first
#   anything else        - '<name> not found' on stderr
# Exits the shell with status 1 when no download tool is available.
GetURL() {
  local tool e
  local -a cmd
  if [ "${1%%://*}" != "$1" ]; then
    # The command is built as an array so that option values containing
    # spaces (the User-Agent, the Referer header) stay single arguments.
    # 2023 - wget 403ing on reddit.com - curl with -L successful
    if tool=$(command -v curl 2>/dev/null); then
      cmd=("$tool" -L -sA "$WUA" -H "Referer: $WRE")
    elif tool=$(command -v links 2>/dev/null); then
      cmd=("$tool" -dump)
    elif tool=$(command -v wget 2>/dev/null); then
      cmd=("$tool" -O - -U "$WUA" "--referer=$WRE")
    else
      # stderr, so the message is not parsed as document text downstream
      echo "Need curl or wget to get URLs" >&2
      exit 1
    fi
    InfoMsg "Loading: $1 with ${cmd[*]}"
    "${cmd[@]}" "$1" 2>/dev/null
    e=$?
    DbgMsg "Loaded ($e): $1"
  elif [ -f "$1" ]; then
    if command -v plutil > /dev/null 2>&1; then
      # Text xml?
      if grep -q '?xml' "$1"; then
        cat "$1"
      elif plutil "$1" > /dev/null 2>&1; then
        plutil -convert 'xml1' -o - "$1"
      else
        # Some random thing
        cat "$1"
      fi
    else
      # No plutil to consider
      cat "$1"
    fi
  else
    echo "$1 not found" >&2
  fi
}
# Print the help text (synopsis, options, examples, version) through Msg.
# The text shows the current values of EQC, DKD, SUD, EXPAND, gVersion and gStamped.
Usage() {
  local b text
  b=$(basename "$0")
  text="Usage: $b [-a][-b][-C][-c][-d][-e][-E c][-f][-h][-i][-k][-K c][-l][-m path][-n #][-Q c][-q][-s suffix][-t][-V][-x][-?] [URL or pathToFile, will use stdin if available]
-a Array item numbers blank (useful when searching for any array item)
-b Ignore blank lines toggle (default ON)
-C Output CSV format
-c Increase rawness content fields (parsed as XML -> HTML -> raw)
-d Increase verbosity
-e Expand full paths (vs compressed . for each matching component to parent)
-E c Change key-value separator (default $EQC)
-f Stop parsing after first match (see -m) per stream. Can specify multiple for multiple matches.
-h Ignore search for ?xml tag to start (e.g. parse HTML)
-i Disable auto indexer (array detector)
-k Dict/key compression toggle (default !$EXPAND) - flatten dict/key into cwp${DKD}keyname/type${EQC}value
-K Set dict/key compression character (default $DKD)
-l Line break conversion to ;; for values (happens for comments by default)
-L crc Change ;; to this for newline replacement
-m pth Match - only output if /path matches given path (at head)
-n cnt Next n fields after match included (-s applies)
-q Do not prefix each line with /path${EQC}
-Q c Quote character before and after values
-r Raw mode - just to test tag parsing code
-s sfx Suffix match - match backside (e.g. .jpg)
-t Ignore content layout tags like tables, divs, spans
-V Output in bash executable KV format (a=b where a is a legal variable name and b a quoted value)
-x Output XML ... work in progress
-y Output YML ... work in progress
or - cat file.xml | $b ...
Examples
Version of macOS thing: xmlp -m ${DKD}CFBundleShortVersionString -q -f /System/Applications/Mail.app/Contents/Info.plist
Reddit RSS images with URLs: xmlp -n 1 -m '${SUD}entry${SUD}content${SUD}a.href' -s '.jpg' -q 'https://reddit.com/r/cityporn/rising/.rss'
Use StdIn: cat /path/to/xmlfile.xml | xmlp
CSV Output: xmlp -C https://www.w3schools.com/xml/plant_catalog.xml
Issues: Wider range of XML file testing, more intelligent choices
Version $gVersion from $gStamped
"
  Msg '-' "$text"
}
# ---- Command line handling and main program ----
# Options adjust the configuration globals; remaining arguments are URLs or
# file paths which are processed one after another. Without arguments, stdin
# is processed unless it is a terminal, in which case the usage text is shown.
while getopts 'abCcdeE:fhiK:kL:lm:n:Q:qrs:tVxy?' o; do
  case "$o" in
    'a') ARRAYNUMS=$((1 - $ARRAYNUMS)) ;; # Toggle showing array item numbers
    'b') IGNOREBLANK=$((1 - $IGNOREBLANK)) ;; # Don't print lines with no value
    'C') OUTCSV=$((1 + $OUTCSV)) ;; # Output CSV format
    'c') RAWCONTENT=$((1 + $RAWCONTENT)) ;; # Rawness of HTML output (parsed, HTML, raw)
    'd') DEBUG=$((1 + $DEBUG)) ;;
    'e') EXPAND=$((1 + $EXPAND)) ;; # Expand output from compressed -> paths -> XML (XML not yet done)
    'E') EQC="$OPTARG" ;; # Equal character
    'f') ONLYFIRST=$((1 + $ONLYFIRST)) ;; # Return after finding first MATCH (-m)
    'h') IGNOREXML=$((1 - $IGNOREXML)) ;; # Allow HTML?
    'i') AUTOIDX=$((1 - $AUTOIDX)) ;; # Array detector
    'k') FLATKEY=$((1 - $FLATKEY)) ;; # Toggle flatten keys in dicts
    'K') DKD="$OPTARG" ;; # Set dict/key delimiter
    'l') MULTILINE=$((1 - $MULTILINE)) ;; # Toggle flatten multilines with LF -> $NEWLINE conversion
    'L') NEWLINE="$OPTARG" ;; # \n replacement string
    'm') MATCH="$OPTARG" ;; # Path match string
    'n') NEXTS="$OPTARG" ;; # Include n lines after SUFFIX is hit
    'Q') QUOTE="$OPTARG" ;; # Quote output values
    'q') ONLYVALUE=$((1 - $ONLYVALUE)) ;; # No path prefix
    'r') READRAW=1 ;; # Tag parser test
    's') SUFFIX="$OPTARG" ;; # Only print lines whose value ends with SUFFIX (e.g. .jpg)
    't') IGNORELAYOUT=$((1 - $IGNORELAYOUT)) ;; # Ignore content layout
    'V') OUTKVO=$((1 + $OUTKVO)) ;; # Output KVO format
    'x') OUTXML=$((1 + $OUTXML)) ;; # Output XML again
    'y') OUTYML=$((1 + $OUTYML)) ;; # Output YML
    '?') Usage; exit 0 ;; # -? itself, and what getopts reports for any unknown option
    *) Msg "Unknown option: $o" ;;
  esac
done
shift $(($OPTIND - 1))
# Rebuild the delimiter set so that a dict/key delimiter chosen with -K is part of it
DDD="[$SUD$DKD$SND$PRD]"
# Set some KVO parameters
if [ $OUTKVO -ne 0 ]; then
  AUTOIDX=1
  EXPAND=1
  QUOTE=\'
  # DDD replacements
  SUD='_'
  SND='_'
  PRD='_'
  # Can not be in above for DDD reasons
  IXD='x'
fi
# If we get a URL - use it, else if no stdin - Usage - else process stdin
if [ $# -gt 0 ]; then
  # Walk the arguments directly: joining them into one string and reading it
  # back line by line altered names containing backslashes or outer blanks.
  for u in "$@"; do
    if [ -n "$u" ]; then
      GetURL "$u" | DoTheRightThing
    fi
  done
elif [ -t 0 ]; then
  Usage
else
  DoTheRightThing
fi