xmlp

#!/bin/bash
# Script version and the date stamp of that version (both are printed at the end of Usage)
gVersion=0.22
gStamped=240913

# Catch interrupts to clean up: TERM and HUP are routed to DoHUP, INT to DoINT.
# Both handlers are defined further down in this file.
trap DoHUP TERM HUP
trap DoINT INT

#
#	xmlp (c) 2020-23 Greg Kerr, Akua Co.
#
#	License: Public Domain
#
#	History
#	022	GLK	240913	Friday the 13th! - Trying to catch errors on 464/477 with cron job saying amp: command not found
#	021	GLK	230902	Failed to wget reddit.com - made curl and links higher priority, added WRE for referer (probably not needed)
#	020	GLK	230610	Merge in YML start from Rock
#	019	GLK	230516	Treat tr tags as array if encountered while parsing XML so -he can convert a table snippet
#	018	GLK	230327	EQC for = symbol
#	017	GLK	230320	Some bug in comments with -> in them causes text until over-next --> to be included in comment, fix Usage
#	016	GLK	230313	Support wrapped tags in comments - e.g. <!-- <disabled>Something</disabled> -->
#	015	GLK	230312	CompressTag on DKD starting paths based on DKD* instead of DDD* (Dict/Key items compress based on Dict, not path)
#	014	GLK	230311	-V option for KVO format - bash executable key value output
#	013	GLK	230310	AutoIndex option, feature
#	012	GLK	230309	Better comments handling
#	011	GLK	230306	If no < in a file, and plutil available, auto convert binary plist to plist
#	010	GLK	230113	Document -a option, handle comments (multi-line values still a problem)
#	009	GLK	221224	Forget when I made last change, but root=feed coming out on top of search for jpgs in rss
#	008	GLK	210207	Start to create new compact format with tabs instead of . and array/dict first item on next line
#	007	GLK	201213	Support -e for current behaviour, compress paths with . matching parent component
#	006	GLK	201124	Support array indexing on by default, -a to toggle
#	005	GLK	201121	Support multiple inputs and file or URL as argument
#	004	GLK	201120	Show Usage if no stdin and no args
#	003	GLK	200823	Add -f flag to return after first match; add -k flag to flatten key/value couplets into /key/name/type:value lines
#	002	GLK	200817	Add -n flag to get URL for JPG in RSS feeds from Reddit
#	001	GLK	200711	Fix <br/> issue - I think fixed on another machine already

#	Todo
#	230312	DONE	Support > in comments
#	230311	DONE	Support CSV output
#	230310	DONE	Change ':' to something else as : is common in rss feed tags

# Compact format
# Key=value
# Key uncompressed
#	/ -> child element
#	~ -> child singlet (no further children, but perhaps properties)
#	. -> property
#	#x -> Array element #x
#	$DTD -> dict member
# Key compressed first character
#	/ Top level path
#	: Top level <key>name</key> in a dict (:name)
#	~ Top level singlet
#	- backup a level
# XML Format
#	Key preceded by number of tabs as prior key paths

# Static
# Browser-like User-Agent (WUA) and Referer (WRE) values that GetURL passes to curl/wget
WUA='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
WRE='http://www.akua.com/refer-me-to-your-leader/'

# Globals config defaults - settable by cl options (see the getopts loop at the end of the file)
DEBUG=0			# Noisy? 0 = quiet, 1 = Info messages, 2 or more = Dbg traces as well (-d increments)
RAWCONTENT=0	# Process html content tags as XML ... scale 0 to 2 (-c increments)
IGNORELAYOUT=1	# Ignore table/div/span constructs in html (-t toggles)
IGNOREBLANK=1	# Don't dump empty items (-b toggles)
IGNOREXML=0		# Handle HTML? Non-zero (-h) skips the search for the ?xml header
ONLYFIRST=0		# Return after finding first match (-f increments)
FLATKEY=1		# Turn dict <key>A</key><string>s</string> into path/A/string/s (-k toggles)
ARRAYNUMS=1		# Turn # for array items into #idx where idx increments each array item (-a toggles)
MATCH=''		# Only print matching path (-m)
SUFFIX=''		# Only print if value has this suffix (-s)
NEXTS=0			# Number of MATCH lines to print after SUFFIX is hit (e.g. to include following html line after jpg matches)
NEXTC=0			# This is set to NEXTS when SUFFIX matches
ONLYVALUE=0		# No tag path: prefix (-q toggles)
EXPAND=0		# Expanded full paths per line vs . per matching component (-e increments)
OUTXML=0		# Output XML again (after edits presumably) (-x)
OUTYML=0		# Output streaming YAML (-y)
MULTILINE=0		# Convert new line to $NEWLINE in values (happens for all comments) (-l toggles)
NEWLINE=';;'	# Convert \n to this if MULTILINE -ne 0 (-L)
READRAW=0		# Just dump tag reading (-r)
AUTOIDX=0		# Detect array of repeating structures and apply index (-i toggles)
OUTKVO=0		# Output KVO form (executable by bash) (-V)
OUTCSV=0		# Output CSV format (may be crazy) (-C)

# Global current tag in process
ROOTELEM=''		# ROOTELEM - first element after <?xml, only once per doc - we start parsing here
ENTITY=''		# TAG[ prop1=2][ prop3=4][/]
CONTENT=''		# Text between <TAG ...> and </TAG>
TAG=''			# TAG of entity e.g. <TAG prop=1 prop2=3>, case sensitive by XML specification
CMT=''			# Parsed comment tag
CMTN=0			# Comment index (for identification)
PROP=''			# properties of TAG
SINGLET=0		# Is the tag a singlet (ends with /> like img, link)
LASTWP=''		# Last tag output (to compress path)
LASTDEEP=0		# Last depth output (new compression)
ARRAYIDX=0		# In nth number of array
DEPTH=0			# How deep into the hierarchy are we?
CDEEP=0			# How deep into the content hierarchy are we?
CWP=''			# Current working path of parse (should start and end with #), SINGLETs don't effect, <tag> pushes, </tag> pops
CCP=''			# Current content tag path
SKIPLINE=0		# Skipping current line - don't print Fields
AUTOX=''		# Auto Index Counters - a string of :tag:depth:count: records maintained by AutoEnter
AUTON=0			# Current Auto Index
PRIOR=''		# Prior tag (can change behavior - e.g. 'key' prior makes IGNOREBLANK ignored for next)
PRION=''		# Prior popped tag (came out of)
QUOTE=''		# Place before and after value outputs (-Q)

# Document Global
HASARRAY=0		# Encountered array (for CSV HTML table vs CSV xml) - does not work because of pipe

# Delimiters
SUD='/'			# Subelement Delimiter
DKD='%'			# Dict/Key compressed delimiter (-K). NOTE(review): the original note read "can not be %" although % IS the default - confirm which character is really forbidden
SND='~'			# Singlet delimiter
PRD='.'			# Property delimiter
DDD="[$SUD$DKD$SND$PRD]"	# For path compression Path delimiters (LASTWP) - built once, here, from the defaults above
IXD='^'			# Index delimiter - within a tag
DAD='.'			# Path compression, use prior path - last [DDD] component + rest after DAD
EQC='='			# Key=Value separator (-E)

# Log file used by Log(): one file per user and per script name.
# NOTE(review): the name is predictable and lives in /tmp - consider mktemp or a per-user directory.
USER=${USER:-$(id -un)}
LOGF="/tmp/$USER-${0##*/}.log"
Msg()
 {
	# Write one message line to stderr.
	#	$1 = '-'	Print the remaining arguments as they are (no time prefix)
	#	otherwise	Prefix the line with "<1000 + seconds into the current hour>: "
	# Arguments are joined with single spaces.
	local	t=''
	local	IFS=' '		# Callers such as ReadTag run with IFS='>'; always join with a space

	if [ "$1" = '-' ]; then
		shift
	else
		# This has to run in the current shell: the stamp used to be computed inside ( ... ),
		# which is a subshell, so the value never made it back and no prefix was printed.
		if [ -n "$EPOCHSECONDS" ]; then
			t="$EPOCHSECONDS"
		else
			t=$(date '+%s')
		fi
		t="$((1000 + (t % 3600))): "
	fi
	printf '%s\n' "$t$*" >&2
 }
Info()
 {
	# Level 1 message: hand all arguments to Msg, but only when DEBUG is above 0.
	# The status is that of the DEBUG test when nothing is printed, else that of Msg.
	[ $DEBUG -gt 0 ] &&
		Msg "$@"
 }
Dbg()
 {
	# Level 2 trace: when DEBUG is above 1, send "====> $1" followed by any further arguments to Msg.
	# Returns non-zero without printing when DEBUG is lower or when no argument was given.
	local	head

	[ $DEBUG -gt 1 ] || return
	head="$1"
	shift || return
	Msg "====> $head" "$@"
 }
Log()
 {
	# Msg with a paper trail: whatever Msg writes to stderr is appended to $LOGF
	# and also passed on to this shell's stderr.
	Msg "$@" 2>&1 |
		tee -a "$LOGF" >&2
 }
Throw()
 {
	# Leave the script.
	#	$1	Optional reason, logged as "=== reason ===" through Log
	#	$2	Exit status (22 when omitted)
	if [ -n "$1" ]; then
		Log "=== $1 ==="
	fi
	exit ${2:-22}
	exit 9		# Only reached when the exit above refused its argument
 }

DoINT()
 {
	# INT handler (installed with trap at the top of the file).
	# Drops the INT trap again and then sends INT to this very shell, so the process ends
	# "killed by INT" rather than by a plain exit - see https://mywiki.wooledge.org/SignalTrap#preview
	# (QUIT is not handled here; per the original note bash ignores it.)
	trap - INT
	kill -s INT $$
 }

DoHUP()
 {
	# TERM/HUP handler: stop through Throw.
	#	$1	Reason handed to Throw ("End" when omitted)
	#	$2	Exit status handed to Throw (0 when omitted)
	local	why="${1:-End}"
	local	code="${2:-0}"

	Throw "$why" "$code"
 }

AutoEnter()
 {
	# Count another visit to tag $1 (seen at depth $2).
	# AUTOX holds records shaped :tag:depth:count: ; AUTON receives the count for this visit.
	# A record is only recognised when AUTOX begins with it; otherwise a fresh record
	# with count 1 is appended.
	local	rest count
	local	key=":$1:"

	rest="${AUTOX#$key}"
	if [ "$rest" != "$AUTOX" ]; then
		rest="${rest#*:}"			# Step over the stored depth
		count="${rest%%:*}"			# Stored count
		count=$((1 + $count))
		AUTOX="${AUTOX%%$key*:*:}${key}$2:$count:${AUTOX##$key*:*:}"
		Dbg "AE: $AUTOX [$count]"
		AUTON=$count
		return 0
	fi

	AUTOX="${AUTOX}${key}$2:1:"
	AUTON=1
 }

AutoLeave()
 {
	# Counterpart of AutoEnter - not implemented yet, it only leaves a Dbg trace
	Dbg 'Nothing yet'
 }

ReadRaw()
 {
	# Debug twin of ReadTag (driven by DoRaw, option -r).
	# Reads stdin up to the next '<', splits the text at its first '>' into ENTITY and CONTENT,
	# then derives TAG, PROP and SINGLET - or, for a <!-- comment, CMT/PROP with TAG='!--'.
	# Every step is traced through Dbg.
	# Returns 0 when a chunk was read, 1 when read fails (end of input or the 33 second timeout).
	local IFS=\>

	Dbg 'RR'
	if read -r -d \< -t 33 ENTITY CONTENT; then
#		Dbg "E: $ENTITY"
#		Dbg "C: $CONTENT"
		CMT="${ENTITY#!--}"				# Comment also single tag element - a '>' inside the comment ends ENTITY early
		CONTENT="${CONTENT%"${CONTENT##*[![:space:]]}"}"	# Remove trailing whitespace characters
		if [ "$CMT" != "$ENTITY" ]; then
			[ -n "$CONTENT" ] && CMT="${ENTITY}>${CONTENT%>}"
			# Get rest of comment up to -->
			while [ "$CMT" = "${CMT%--}" ]; do
				Dbg "C: $CMT"
				if read -r -d \< -t 33 TAG; then
					Dbg "D: $TAG"
					TAG="${TAG%"${TAG##*[![:space:]]}"}"			# Remove trailing whitespace characters
					[ "$TAG" != "${TAG%-->}" ] && TAG="${TAG%>}"	# At end of comment?
					CMT="${CMT}<$TAG"
				else
					CMT="${CMT}--"		# Input ran out: close the comment ourselves so the loop ends
				fi
			done
			PROP="${CMT%--}"
			TAG='!--'
		else
			CONTENT="${CONTENT#"${CONTENT%%[![:space:]]*}"}"	# Remove leading whitespace characters
			ENTITY="${ENTITY%"${ENTITY##*[![:space:]]}"}"		# Remove trailing whitespace characters
			SINGLET=1
			TAG="${ENTITY%/}"				# Single tag element? (ends with /)
			[ "$TAG" = "$ENTITY" ] && SINGLET=0 || TAG="${TAG%"${TAG##*[![:space:]]}"}"		# Remove trailing whitespace characters
			PROP="${TAG#*[[:space:]]}"		# Break out tag and properties (\t \n \v \f \r " ")
			PROP="${PROP%[[:space:]]}"		# Dangling space in singlet
			TAG="${TAG%%[[:space:]]*}"		# Space separator after TAG?
			# A tag without properties leaves PROP equal to the tag name - clear it, as ReadTag does
			[ "$PROP" = "$TAG" ] && PROP=''
		fi
		Dbg "e: $ENTITY"
		Dbg "c: $CONTENT"
		Dbg "t: $TAG [$SINGLET]"
		Dbg "p: $PROP"
		return 0
	fi
	return 1
 }

ReadTag()
 {
	# Fetch the next "tag>content" chunk from stdin and take it apart.
	#	Input is cut at every '<' (the read delimiter) and split once at '>' (IFS), e.g.
	#	BODY a=b c="Some text">Test<NextTag> -> ENTITY = BODY ... text", CONTENT = Test, TAG = BODY
	#	and the stream is left positioned at NextTag>
	# Sets ENTITY, CONTENT, CMT, TAG and PROP; SINGLET is set for everything but comments.
	# A <!-- comment ends up as TAG='!--' with its text in PROP.
	# Returns 0 when a chunk was read, 1 otherwise (end of input or timeout).
	local IFS=\>

	# -d delimiter, -r backslash is an ordinary character, -t timeout in seconds
	read -r -d \< -t 33 ENTITY CONTENT || return 1

	CMT="${ENTITY#!--}"									# Differs from ENTITY only for comments
	CONTENT="${CONTENT%"${CONTENT##*[![:space:]]}"}"	# Trim the right side

	if [ "$CMT" = "$ENTITY" ]; then
		# --- Ordinary tag ---
		CONTENT="${CONTENT#"${CONTENT%%[![:space:]]*}"}"	# Trim the left side
		ENTITY="${ENTITY%"${ENTITY##*[![:space:]]}"}"		# Trim the right side
		TAG="${ENTITY%/}"
		if [ "$TAG" = "$ENTITY" ]; then
			SINGLET=0			# No closing '/': a regular opening or closing tag
		else
			SINGLET=1			# <tag ... />
			TAG="${TAG%"${TAG##*[![:space:]]}"}"
		fi
		PROP="${TAG#*[[:space:]]}"		# Everything after the first whitespace
		PROP="${PROP%[[:space:]]}"		# Dangling space in singlet
		TAG="${TAG%%[[:space:]]*}"		# Everything before the first whitespace
		if [ "$PROP" = "$TAG" ]; then	# No whitespace at all means no properties
			PROP=''
		fi
	else
		# --- Comment --- a '>' inside the comment cut ENTITY short, so glue the pieces together again
		[ -n "$CONTENT" ] && CMT="${ENTITY}>${CONTENT%>}"
		# Keep reading chunks until the text ends with --
		until [ "$CMT" != "${CMT%--}" ]; do
			if read -r -d \< -t 33 TAG; then
				TAG="${TAG%"${TAG##*[![:space:]]}"}"
				[ "$TAG" != "${TAG%-->}" ] && TAG="${TAG%>}"	# Reached the closing --> ?
				CMT="${CMT}<$TAG"
			else
				CMT="${CMT}--"			# Out of input, terminate the comment here
			fi
		done
		PROP="${CMT%--}"
		TAG='!--'
	fi

	# TAG (Element Name) by spec are case sensitive, but attributes not - so no case folding here
	Dbg "RTAG: $TAG [$PROP]$CONTENT[$SINGLET|$DEPTH|$CWP]"
	return 0
 }

ReadXmlTag()
 {
	# Read the next tag with ReadTag and keep the XML path state up to date.
	#	Opening tags are pushed onto CWP and raise DEPTH; closing tags pop CWP, lower DEPTH,
	#	remember the tag in PRION and blank TAG; comments and singlets leave the path alone.
	#	With FLATKEY on, plist style dict/key/array (and html tr) constructs are rewritten:
	#	<key>name</key> becomes the path component name/key, array children become #idx components.
	# Returns 0 when a tag was read, 1 when ReadTag fails.
	# NOTE(review): pTag and one of the array branches cut CWP at a literal '/' instead of $SUD,
	# which differs once -V changes SUD to '_' - confirm whether that is intended.
	local	aIdx match
	local	pTag="${CWP##*/}"				# Last path component before this read
	local	pDic="${pTag#*${DKD}}"			# Same with everything up to the first DKD dropped (equal to pTag when it holds no DKD)
	local	del="$SUD"						# Delimiter used when TAG gets pushed

	if ReadTag; then
		if [ $SINGLET -eq 0 -a -n "$TAG" ]; then
			# Convert dict & array to macro, join dict/key into ${DKD}keyname construct
			if [ $FLATKEY -ne 0 ]; then case "$TAG" in
			  'key')
				# The key name becomes part of the path, so '/' and ':' in it are replaced by '_' and '+'
				if [ $BASH_VERSINFO -gt 3 ]; then
					CONTENT="${CONTENT//\//_}"
					CONTENT="${CONTENT//:/+}"
				else
					CONTENT=$(echo "$CONTENT" | tr '/:' '_+')
				fi
				if [ "$pTag" = 'dict'  ]; then	# First level of dictionary
					CWP="${CWP%[${SUD}]*}"
					TAG="$CONTENT/key"	# Dict name becomes Tag
					CONTENT=''
					del="$DKD"
				elif [ "$pDic" != "$pTag" ]; then	# Parent already is a DKD component: replace it with this key
					CWP="${CWP%[${DKD}]*}"
					TAG="$CONTENT/key"
					CONTENT=''
					del="$DKD"
				else
					TAG="$CONTENT/key"
					CONTENT=''
				fi
			  ;;
			  '/array'|'/tr')
				pDic="${pTag%#*}"
				# Revert array number in path to array
				[ "$pDic" != "$pTag" ] && CWP="${CWP%${SUD}#*}${SUD}${TAG#/}"
				HASARRAY=1
			  ;;
			  '/dict')
				# Revert path from $DKD to being a dict
				[ "$pDic" != "$pTag" ] && CWP="${CWP%[${DKD}]*}${SUD}dict"
			  ;;
			  *)
				case "$pDic" in
				  'array'|'tr')
					# First child of an array: the array component turns into #1 (or a bare # with -a)
					AUTOIDX=0	# Have arrays, turn this off
					ARRAYIDX=1
					if [ $ARRAYNUMS -eq 0 ]; then
						aIdx=''
						# PrintTag "${CWP}" '-'
					else
						aIdx="$ARRAYIDX"
					fi
					CWP="${CWP%/*}${SUD}#$aIdx"
				  ;;
				  '#'[0-9]*)
					# Next child of the same array: move on to the next index
					ARRAYIDX=$((1 + $ARRAYIDX))
					if [ $ARRAYNUMS -eq 0 ]; then
						aIdx=''
						# PrintTag "${CWP}" '-'
					else
						aIdx="$ARRAYIDX"
					fi
					CWP="${CWP%${SUD}*}${SUD}#$aIdx"
				  ;;
				esac
			 ;;
			esac; fi

			# Singlets are processed as a whole - so we are either going deep or popping up
			if [ "$TAG" = '!--' ]; then	# Comment? Don't change depth or CWP (Path)
				Dbg "Comment: $PROP"
			elif [ "${TAG#/}" = "$TAG" ]; then	# Going deeper (not ending a block)
				# Msg "-------------> Push $TAG onto ($CWP - $pDic - $PROP - $CONTENT)"
				CWP="${CWP}${del}$TAG"
				DEPTH=$((1 + $DEPTH))
#				Msg "*** $PRION : $TAG ($PRIOR)"
				# Auto index: the tag just closed is being opened again, so number the repeat
				[ $AUTOIDX -ne 0 -a "$PRION" = "/$TAG" ] && AutoEnter "$TAG" $DEPTH && CWP="${CWP}${IXD}$AUTON"
			else
				# Closing tag: compare it with the last path component (minus any auto index)
				match="${CWP##*${SUD}}"
				match="${match%${IXD}*}"
				[ -z "$match" ] && match="$ROOTELEM"
				[ "${TAG#/}" != "$match" ] && Msg "*** XML Tag mismatch at $CWP [$match] (${TAG#/} expected)"
				# Msg "<------------ Pop $TAG from ($CWP - $CONTENT)"
				DEPTH=$(($DEPTH - 1))
				# If not compressing, must indicate end of record
				[ $EXPAND -ne 0 -a $FLATKEY -eq 0 ] && PrintTag "$TAG" '.'
				PRION="$TAG"
				[ $DEPTH -eq 0 ] && CWP='' || CWP="${CWP%${SUD}*}"
				TAG=''							# Coming up
			fi
		fi
		return 0
	fi
	return 1
 }


ReadHtmlTag()
 {
	# ReadTag for html content: maintains the content path CCP and its depth CDEEP.
	#	Layout tags (tr td table div span br p and their closers) that carry no properties are
	#	blanked while IGNORELAYOUT is on. Opening tags extend CCP, closing tags shorten it and
	#	blank TAG; a closer that does not match the end of CCP is reported through Msg.
	# Returns 0 when a tag was read, 1 when ReadTag fails.
	ReadTag || return 1

	if [ $IGNORELAYOUT -ne 0 -a -z "$PROP" ]; then
		case "$TAG" in
		  tr|td|table|div|span|br|p|/tr|/td|/table|/div|/span|/p)	TAG=''	;;
		esac
	fi

	# Singlets and blanked tags never touch the path
	[ $SINGLET -eq 0 -a -n "$TAG" ] || return 0

	if [ "${TAG#/}" != "$TAG" ]; then
		# Closing tag
		[ "${TAG#/}" != "${CCP##*${SUD}}" ] && Msg "*** HTML Tag mismatch at $CCP (${TAG#/} expected)"
		CDEEP=$(($CDEEP - 1))
		CCP="${CCP%${SUD}*}"
		TAG=''							# Coming up
	else
		# Opening tag
		CCP="${CCP}${SUD}$TAG"
		CDEEP=$((1 + $CDEEP))
	fi
	return 0
 }


PrintLine()
 {
	# Gatekeeper in front of PrintField for one path/value pair.
	#	$1	Path (tag)
	#	$2	Value
	# A path that does not begin with MATCH sets SKIPLINE=1 and prints nothing. Otherwise
	# SKIPLINE is cleared and the pair goes to PrintField, unless the value is empty while
	# IGNOREBLANK is on. A singlet directly after a 'key' tag prints as $SND$TAG.
	# Always returns 0.
	local	path="$1"
	local	value="$2"

	Dbg "LINE: $path [$value] [$MATCH] [$SUFFIX] $IGNOREBLANK [$PRIOR]"

	if [ -n "$MATCH" -a "${path#$MATCH}" = "$path" ]; then
		SKIPLINE=1
		return 0
	fi

	SKIPLINE=0
	if [ $SINGLET -ne 0 -a "$PRIOR" = 'key' ]; then
		value="$SND$TAG"
	fi
	if [ -n "$value" -o $IGNOREBLANK -eq 0 ]; then
		PrintField "$path" "$value"
	fi

	return 0
 }


PrintProps()
 {
	# Print the properties of a tag, one PrintLine call per property.
	#	$1	Path of the tag (for a singlet, $SND$TAG is appended to it)
	#	$2..	One name=value word per property; a single pair of surrounding " and/or ' is removed
	# Each property is printed under the path <tag path>$PRD<name>.
	local	attr name value
	local	base="$1"
	shift

	Dbg "PROP: [$base] [$@] [$#]"
	if [ $SINGLET -ne 0 ]; then
		base="${base}$SND${TAG}"
		Dbg "SINGLET: $base [$TAG]"
	fi

	for attr; do
		name="${attr%%=*}"			# Left of the first =
		value="${attr#*=}"			# Right of the first =
		value="${value#\"}"			# Chop off quoting
		value="${value%\"}"
		value="${value#\'}"
		value="${value%\'}"
		PrintLine "$base$PRD$name" "$value"
	done
 }


MultiLine()
 {
	# Filter: join all lines read from stdin into a single line on stdout, with $NEWLINE
	# standing in for every line break. Leading and trailing blanks of each line are dropped
	# (read splits on the default IFS).
	local	l line=''

	# -r keeps backslashes in the text (plain read silently eats them);
	# the || test keeps a final line that has no terminating newline.
	while read -r l || [ -n "$l" ]; do
		line="${line}${NEWLINE}$l"
	done
	# Drop the separator placed before the first line; quoted so it is matched literally
	printf '%s\n' "${line#"$NEWLINE"}"
 }


SplitProps()
 {
	# Helper for PrintTag: split a property string ($1) into words in the global array PROPW.
	#	Words are separated by whitespace (newlines included) outside of quotes;
	#	"..." and '...' keep their text together and the quote characters are dropped.
	# The text is only ever treated as data - nothing in it is expanded or executed.
	local	s="$1" w run q
	local	dq='"' sq="'"

	PROPW=()
	while :; do
		s="${s#"${s%%[![:space:]]*}"}"		# Skip the whitespace between words
		[ -z "$s" ] && break
		w=''
		while [ -n "$s" ]; do
			case "$s" in
			  [[:space:]]*)	break	;;		# End of this word
			  "$dq"*|"$sq"*)
				q="${s:0:1}"
				s="${s:1}"
				case "$s" in
				  *"$q"*)	w="$w${s%%"$q"*}"; s="${s#*"$q"}"	;;
				  *)		w="$w$s"; s=''						;;	# Quote never closes: take the rest
				esac
			  ;;
			  *)
				run="${s%%[[:space:]$dq$sq]*}"	# Plain text up to the next blank or quote
				w="$w$run"
				s="${s:${#run}}"
			  ;;
			esac
		done
		PROPW[${#PROPW[@]}]="$w"
	done
 }

PrintTag()
 {
	# Print one tag: its properties (via PrintProps) followed by its content (via PrintLine).
	#	$1	Path to print under; defaults to the current working path CWP
	#	$2	Optional replacement for PROP
	# Comments (TAG = '!--') print PROP as a single line: under '#' normally, under __<CMTN> when OUTYML is set.
	#
	# Properties used to be expanded with: eval PrintProps "tag" $PROP
	# That executed whatever the document put into an attribute - $(...), backticks, ; and &
	# (the source of "amp: command not found") - and treated newlines between attributes as
	# command separators. SplitProps does the same word splitting without running anything.
	# Side effect: Dbg traces from PrintProps are no longer swallowed by the old 2>/dev/null.
	local	tag

	tag="$1"
	[ -n "$2" ] && PROP="$2"
	[ -z "$tag" ] && tag="$CWP" && Dbg "TCWP: $tag [$PROP]"

	if [ $OUTYML -eq 0 ]; then
		Dbg "PTAG: [$tag] [$PROP] $IGNOREBLANK [$PRIOR]"
		if [ "$TAG" = '!--' ]; then
			PROP=$(echo "$PROP" | MultiLine)
			PrintLine '#' "$PROP"
			CMTN=$((1 + $CMTN))
		else
			[ $MULTILINE -ne 0 ] && PROP=$(echo "$PROP" | MultiLine)
			if [ "$PROP" != "${PROP#*\&}" ]; then
				# PROP can have '&amp;' and friends - decode them (and the path) first
				HtmlUnraw "$PROP"; PROP="$OUT"
				HtmlUnraw "$tag"; tag="$OUT"
			fi
			SplitProps "$PROP"
			PrintProps "$tag" "${PROPW[@]}"
			[ $MULTILINE -ne 0 ] && CONTENT=$(echo "$CONTENT" | MultiLine)
			PrintLine "$tag" "$CONTENT"
		fi
	elif [ $OUTXML -eq 0 ]; then
		Dbg "PTAG: [$tag] [$PROP] $IGNOREBLANK [$PRIOR]"
		if [ "$TAG" = '!--' ]; then
			PROP=$(echo "$PROP" | MultiLine)
			PrintLine "__$CMTN" "$PROP"
			CMTN=$((1 + $CMTN))
		else
			[ $MULTILINE -ne 0 ] && PROP=$(echo "$PROP" | MultiLine)
			SplitProps "$PROP"
			PrintProps "$tag" "${PROPW[@]}"
			[ $MULTILINE -ne 0 ] && CONTENT=$(echo "$CONTENT" | MultiLine)
			PrintLine "$tag" "$CONTENT"
		fi
	else
		# Untested and makes no sense
		if [ $SINGLET -ne 0 ]; then
			PrintLine "<$TAG/>" || [ -z "$PROP" ] && PrintLine "${CWP}:" "$TAG"
		else
			PrintLine "<$TAG $PROP>$CONTENT</$TAG>" || PrintLine "$tag" "$CONTENT"
		fi
	fi
 }


CompressTag()
 {
	# Compress tag path with prior output path if applicable
	#	$1	Full path about to be printed
	# Result in OUT: $1 itself, or ${DAD} plus what is left of $1 after the remembered prefix
	# LASTWP. LASTWP is then updated for the next call.
	# Rules for the new LASTWP
	#	If array, drop from the last array item (${SUD}# delimiter) on
	#	If key/dict, drop from the last ${DKD} on
	#	Else drop the last component at any of the 4 path delimiters ($DDD)
	# Always returns 0.
	local	a
	local	t="$1"

	Dbg "*** COMP: [$LASTWP][$t][$CWP]"

	a="${t#"$LASTWP"}"	# Try lopping off LastWP (quoted: matched literally, not as a glob)
	if [ "${t#*${SUD}#}" != "$t" ]; then
		LASTWP="${t%${SUD}#*}"
	elif [ "${t#*"${DKD}"}" != "$t" ]; then
		# Was a hard-coded %, which stopped matching as soon as -K picked another delimiter
		LASTWP="${t%"${DKD}"*}"
	else
		LASTWP="${t%${DDD}*}"
	fi
	[ "$a" != "$t" ] && t="${DAD}$a"
	OUT="$t"
	return 0
 }


PrintField()
 {
	# Called from PrintLine: write one output line for a path/value pair.
	#	$1	Path (tag)
	#	$2	Value - runs of whitespace are collapsed to single spaces
	# Nothing is written while SKIPLINE is set. With SUFFIX set, only values ending in it are
	# written, plus the NEXTS values that follow such a hit.
	# Output is tag${EQC}value (value alone with ONLYVALUE), the value wrapped in QUOTE when set;
	# with OUTXML it is <tag>value</tag> indented by DEPTH tabs.
	# With ONLYFIRST set and a MATCH or SUFFIX in force, exits the shell after the ONLYFIRST-th line.
	local	f p l tag n
	local	bs='\'
	tag="$1"
	f="$2"
	l=''
	f="${f%"${f##*[![:space:]]}"}"		# Remove trailing whitespace characters
	while [ -n "$f" ]; do
		f="${f#"${f%%[![:space:]]*}"}"	# Remove leading whitespace characters
		p="${f%%[[:space:]]*}"			# Get first "field"
		l="$l $p"
		[ "$p" = "$f" ] && f='' || f="${f#*[[:space:]]}"
	done
	l="${l# }"
	Dbg "FIELD: $tag [$2] [$p] [$l] [$EXPAND]"

	if [ $SKIPLINE -eq 0 ]; then
		if [ -n "$SUFFIX" ]; then
			f="${l%$SUFFIX}"
			if [ "$f" = "$l" ]; then		# Suffix mismatch
				[ $NEXTC -le 0 ] && return 0
				NEXTC=$(($NEXTC - 1))
			else
				NEXTC="$NEXTS"				# Suffix match, this many matching tags after this one will be included
			fi
		fi

		# Compress some tag path items - e.g. dict/key to $DKD
		if [ $OUTXML -eq 0 ]; then
			if [ $EXPAND -eq 0 -a "$TAG" != '!--' ]; then
				CompressTag "$tag"
				tag="$OUT"
			fi
			if [ $OUTKVO -ne 0 ]; then
				if [ $BASH_VERSINFO -gt 3 ]; then
					tag="${tag//:/_}"
				else
					tag=$(echo "$tag" | sed -e 's/:/_/g')
				fi
			fi
			if [ -n "$QUOTE" ]; then
				# Every quote character inside the value becomes: quote, backslash, quote, quote
				# - for ' that is '\'' , which a shell reads back as a literal quote.
				if [ $BASH_VERSINFO -gt 3 ]; then
					# The backslash comes from a variable: the old \\\\ (written for sed) put TWO backslashes here
					l="${l//"$QUOTE"/$QUOTE$bs$QUOTE$QUOTE}"
				else
					l=$(echo "$l" | sed -e "s/$QUOTE/$QUOTE\\\\$QUOTE$QUOTE/g")
				fi
			fi
			[ $ONLYVALUE -eq 0 ] && echo "${tag}${EQC}${QUOTE}${l}${QUOTE}" || echo "${QUOTE}$l${QUOTE}"
		else
			# Indent with its own counter - counting in l wiped the value and always printed 0
			for ((n=$DEPTH; n>0; n--)); do printf "\t"; done
			echo "<$tag>${l}</$tag>"
		fi

		[ $ONLYFIRST -eq 1 -a -n "$MATCH$SUFFIX" ] && exit 0 || ONLYFIRST=$(($ONLYFIRST - 1))
	fi
 }


FlattenXml()
 {
	# Main XML loop: start from an empty path, then read tags with ReadXmlTag until it fails.
	#	comments	printed through PrintTag when they have text
	#	content		handed to DoContentTag, after which the next tag (its closer) is consumed
	#	the rest	printed through PrintTag
	# After every tag that is not a comment, PRIOR is set to the last /-separated part of TAG.
	CWP=''	# Start at root of whomever called me
	DEPTH=0

	while ReadXmlTag; do
		[ -n "$TAG" ] || continue		# Closing tags come back blank

		if [ "$TAG" = '!--' ]; then
			Info "CMT: $PROP"
			[ -n "$PROP" ] && PrintTag '!--' "$PROP"
		elif [ "$TAG" = 'content' ]; then
			Dbg "CONTENT ($RAWCONTENT): $CONTENT"
			DoContentTag "$TAG" "$CONTENT"
			ReadXmlTag || return 0		# kill /content
		else
			Dbg "READ: $TAG [$PROP] [$CONTENT] [$CWP] [$OUTXML]"
			PrintTag
			Dbg "RADE: $TAG [$PROP] [$CONTENT] [$CWP]"
		fi

		[ "$TAG" != '!--' ] && PRIOR="${TAG##*/}"
	done
 }


FlattenHtml()
 {
	# Flatten html content read from stdin, underneath the XML path that is current on entry.
	#	Tags are read with ReadHtmlTag; each one is printed through PrintTag with CWP set to
	#	<entry CWP><content path CCP>. Comments are printed when they have text.
	# CWP is put back to its entry value before returning.
	local	cwp="$CWP"

	CCP=''
	CDEEP=0

	while ReadHtmlTag; do if [ -n "$TAG" ]; then case "$TAG" in
	  '!--')
		Info "CMT: $PROP"
		[ -n "$PROP" ] && PrintTag '!--' "$PROP"
	  ;;

	  *)
#		Msg "HTML: $TAG - ${root} - W:${CWP} - C:${CCP}"
		CWP="${cwp}${CCP}"
		PrintTag
	  ;;
	esac; fi; done

	CWP="$cwp"		# Was "$cwo" - an unset name, which emptied CWP instead of restoring it
 }

HtmlUnraw()
 {
	# Decode the basic HTML entities in $1; the result is returned in OUT.
	#	&lt; &gt; &quot; &#32; &#032; &#39; &#039; and, last of all, &amp;
	# &amp; has to go last so every entity is decoded exactly once: "&amp;quot;" must come out
	# as the text &quot; and not as a quote.
	local	rest
	local	dq='"' sq="'"

	if [ $BASH_VERSINFO -gt 3 ]; then
		# Quote characters are taken from variables so no backslash escapes sit in the replacements
		OUT="${1//&lt;/<}"
		OUT="${OUT//&gt;/>}"
		OUT="${OUT//&quot;/$dq}"
		OUT="${OUT//&#32;/ }"
		OUT="${OUT//&#032;/ }"
		OUT="${OUT//&#39;/$sq}"
		OUT="${OUT//&#039;/$sq}"
		# &amp; is done by hand, in one pass: with bash 5.2 (patsub_replacement) an & in the
		# replacement of ${var//pat/&} means "the matched text", so that form changes nothing.
		rest="$OUT"
		OUT=''
		while [ "$rest" != "${rest#*&amp;}" ]; do
			OUT="$OUT${rest%%&amp;*}&"
			rest="${rest#*&amp;}"
		done
		OUT="$OUT$rest"
	else
		OUT=$(printf '%s\n' "$1" | sed -e 's/&lt;/</g' -e 's/&gt;/>/g' -e 's/&quot;/"/g' -e 's/&#32;/ /g' -e 's/&#032;/ /g' -e "s/&#39;/'/g" -e "s/&#039;/'/g" -e 's/&amp;/\&/g')
	fi
 }

DoContentTag()
 {
	# Deal with the text of a <content> tag, depending on RAWCONTENT
	#	0	decode the entities, then parse the result as html (FlattenHtml)
	#	1	decode the entities, print the result as one value
	#	else	print the text untouched
	#	$1	Tag/path to print under (modes 1 and 2)
	#	$2	The content text
	local	decoded

	# Basic HTML encoded special character conversion - not needed for the raw mode
	if [ $RAWCONTENT -lt 2 ]; then
		HtmlUnraw "$2"
		decoded="$OUT"
		Info "HTML: $decoded"
	fi

	if [ "$RAWCONTENT" = 0 ]; then
		# FlattenHtml runs on the far side of a pipe: variables it changes stay in that
		# subshell (ideally we'd like LASTWP back)
		echo "$decoded" | FlattenHtml
	elif [ "$RAWCONTENT" = 1 ]; then
		PrintLine "$1" "$decoded"
	else
		PrintLine "$1" "$2"
	fi
 }

DoRaw()
 {
	# Tag reader test (-r): print "T: <tag>" for every chunk ReadRaw delivers.
	# Always returns 0. It used to hand back the status of the closing Dbg call, which is
	# non-zero unless DEBUG is above 1, so "DoRaw && return 0" in DoXML fell through.
	while ReadRaw; do
		echo "T: $TAG"
	done
	Dbg 'Exit Raw'
	return 0
 }

DoXML()
 {
	# Process one XML document from stdin.
	#	1. With READRAW set, only run the tag reader test (DoRaw)
	#	2. Skip input until the ?xml header and print its properties under 'xml'
	#	3. Read on until the root element: a !DOCTYPE is printed as dtd.* lines, comments are
	#	   printed, the first other tag becomes ROOTELEM and is printed as root
	#	4. Flatten everything below the root with FlattenXml
	# Exits the (sub)shell with status 0 when the input ends before the header or the root is found.
	local c p d r
	# Just debug the reader
	# NOTE(review): the return below only happens when DoRaw reports success
	[ $READRAW -ne 0 ] && DoRaw && return 0

	# Scan to start of an XML section (mostly the first line)
	while [ "$TAG" != '?xml' ]; do
		ReadTag || exit 0	# XML header
	done
	# Strip the leading ? from the tag and the trailing ? from its properties
	TAG="${TAG#?}"
	PROP="${PROP%?}"
	PrintTag 'xml'

	# Next tag is DTD by XML definition; RSS Feeds don't have !doctype - they go right into feed
	# Scan until root
	while [ -z "$ROOTELEM" ]; do
		ReadXmlTag || exit 0
		# <?xml version="1.0" encoding="UTF-8"?>
		# <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
		# <plist version="1.0">
		case "$TAG" in
		  ![Dd][Oo][Cc][Tt][Yy][Pp][Ee])
			# Walk the DOCTYPE words in order, naming them from the list in c;
			# d is the delimiter that ends the current word: a space, or a " for quoted words
			c='class realm flavor url prop'
			d=' '
			while [ -n "$PROP" ]; do
				p="${PROP%%${d}*}"
				p="${p%${d}}"
				# Msg "### dtd: $p [$PROP][$d]"
				PrintLine "dtd.${c%% *}" "$p"
				[ -z "$r" ] && r="$p"		# The first word (class) is kept for the root check below
				p="${PROP#*${d}}"
				p="${p# }"
				[ "${p:0:1}" = \" ] && d=\" || d=' '
				[ "$p" = "$PROP" ] && PROP='' || PROP="$p"
				PROP="${PROP#${d}}"
				c="${c#* }"
			done
		  ;;
		  '!--')	PrintTag '!--' "$PROP"	;;
		  *)
			# The first non xml/doctype/comment element - we could verify it matches 'class' from DTD
			DEPTH=1
			ROOTELEM="$TAG"
			[ -n "$r" -a "$r" != "$TAG" ] && Msg "*** Top level does not match DTD class ($r != $TAG)" 
			PrintLine 'root' "$ROOTELEM"
			PrintTag 'root'
		  ;;
		esac
	done

	# Now inside XML
	Info "ENTER: XML $ROOTELEM ($CWP)"
	FlattenXml	# Start at the top tag
	# NOTE(review): Info is the last command, so this function returns non-zero whenever DEBUG is 0
	Info "LEAVE: XML $ROOTELEM"
 }

DoTheRightThing()
 {
	# Run the parser over stdin: DoXML normally, FlattenXml when the ?xml header search is
	# switched off (IGNOREXML, option -h). With OUTCSV the parser output is piped through OutCSV.
	# Returns OutCSV's status in CSV mode, 0 otherwise.
	#
	# This used to read "[ ... ] && DoXML || FlattenXml", which ALSO started FlattenXml whenever
	# DoXML returned non-zero - and DoXML ends on an Info call that does so unless DEBUG is set.
	local	parser='DoXML'

	[ $IGNOREXML -eq 0 ] || parser='FlattenXml'

	if [ $OUTCSV -ne 0 ]; then
		$parser | OutCSV
		return
	fi
	$parser
	return 0
 }

FieldCSV()
 {
	# Make $1 safe as one CSV field; the result is returned in OUT.
	# A value holding a comma, a $NEWLINE marker or a double quote is wrapped in double quotes
	# after: doubling every ", hiding every comma as v:^:v (FieldCSX brings them back) and
	# turning every $NEWLINE marker into a real line break. Other values pass through unchanged.
	local	cell="$1"
	local	risky=0

	[ "$cell" != "${cell#*,}" ] && risky=1
	[ "$cell" != "${cell#*${NEWLINE}}" ] && risky=1
	[ "$cell" != "${cell#*\"}" ] && risky=1

	if [ $risky -ne 0 ]; then
		if [ $BASH_VERSINFO -gt 3 ]; then
			cell="${cell//\"/\"\"}"
			cell="${cell//,/v:^:v}"
			cell="${cell//$NEWLINE/
}"
		else
			cell=$(echo "$cell" | sed -e "s/\"/\"\"/g" -e "s/$NEWLINE/\n/g" -e 's/,/v:^:v/g')
		fi
		cell="\"${cell}\""
	fi

	OUT="$cell"
 }

FieldCSX()
 {
	# Print $1 on stdout with every v:^:v placeholder (put in by FieldCSV) turned back into a comma
	local	row="$1"

	case "$row" in
	  *'v:^:v'*)
		if [ $BASH_VERSINFO -gt 3 ]; then
			row="${row//v:^:v/,}"
		else
			row=$(echo "$row" | sed -e 's/v:^:v/,/g')
		fi
	  ;;
	esac
	echo "$row"
 }

OutCSV()
 {
	# Filter: turn the flattened path=value lines on stdin into CSV rows on stdout.
	#	Input is ignored until the first line that starts with $SUD.
	#	The base name of each path (after the last path delimiter) selects the column:
	#	th	adds a column title
	#	td	adds a cell to the current row; array item number 1 starts a new row
	#	other	looked up in the title list - unknown names add a new column, known names fill
	#		their column in the current row, and a column that is already filled ends the row
	#	The title row is printed once, just before the first data row.
	# title = comma separated column names, L = the row being built, cnt = number of columns,
	# first = set once the title row has been printed, P = set once processing has started.
	# NOTE(review): v is assigned below but is missing from the local list, so it leaks into the caller's scope.
	local	line t first L cnt title s p n f P

	first=''
	cnt=0
	title=''
	L=''
	P=0		# Processing?

	while read line; do if [ $P -ne 0 -o "${line#${SUD}}" != "$line" ]; then
		P=1							# Processing after first item
		t="${line%%=*}"				# First field name
		# Anything nested inside a td is counted as that td
		[ "$t" != "${t%/td/*}" ] && t="${t%/td/*}/td"
		FieldCSV "${t##*${DDD}}"	# Base name
		t="$OUT"
		FieldCSV "${line#*=}"		# Value part
		v="$OUT"

		case "$t" in
		  'th')
			HtmlUnraw "$v"
			[ -z "$title" ] && title="$OUT" || title="${title},$OUT"
			cnt=$((1 + $cnt))
		  ;;
		  'td')	# Could be td/div/div/span=
			[ -z "$first" ] && first="${title##,*}" && FieldCSX "$title"
			# Get array item number
			p="${line%%=*}"
			p="${p%/td*}"
			p="${p##*#}"
			# Item 1 opens a new row: print the row collected so far
			[ $p -eq 1 -a -n "$L" ] && FieldCSX "$L" && L=''
			HtmlUnraw "$v"
			[ -z "$L" ] && L="$OUT" || L="${L},$OUT"
		  ;;
		  *)
			# Find field in title
			s="$title"
			p=0
			n=0
			while [ -n "$s" -a $p -eq 0 ]; do
				n=$((1 + $n))
				[ "${s%%,*}" = "$t" ] && p=$n
				[ "${s#*,}" = "$s" ] && s='' || s="${s#*,}"
			done

			if [ $p -eq 0 ]; then
				# New field
				p=$n
				[ $p -gt $cnt ] && cnt=$p
				[ -z "$title" ] && title="$t" || title="${title},$t"
				[ -z "$L" ] && L="$v" || L="${L},$v"
			else
				# Add field - if it collides, print line and start new
				# Copy the first p cells of the row back into L; f ends up as cell p, s as what follows it
				n=$p
				s="$L"
				L=''
				while [ -n "$s" -a $n -gt 0 ]; do
					f="${s%%,*}"
					[ -z "$L" ] && L="$f" || L="${L},$f"
					[ "${s#*,}" = "$s" ] && s='' || s="${s#*,}"
					n=$(($n - 1))
				done
				if [ $n -gt 0 ]; then
					# We have to create fields
					while [ $n -gt 1 ]; do
						L="${L},"
						n=$(($n - 1))
					done
					L="${L},$v"
				elif [ -z "$f" ]; then
					# Empty field, replace it
					L="${L},$v,$s"
				else
					# Collision, new row
					[ -z "$first" ] && first="${title##,*}" && FieldCSX "$title"
					[ -n "$s" ] && L="${L},${s}"
					FieldCSX "$L"
					L=''
					while [ $n -gt 0 ]; do
						L="${L},"
						n=$(($n - 1))
					done
					[ -n "$L" ] && L="${L},$v" || L="$v"
				fi
			fi
		  ;;
		esac
	fi; done
	# Flush the last row
	[ -n "$L" ] && FieldCSX "$L"
 }

GetURL()
 {
	# Write the document named by $1 to stdout.
	#	scheme://...	fetched with the first of curl, links, wget that is installed
	#	existing file	copied as is; when plutil is available, the file holds no '?xml' text and
	#			plutil accepts it, it is converted to an xml1 plist first
	#	anything else	"<name> not found" on stderr
	# Returns the status of the fetch for URLs and 1 for a missing file.
	# Exits with status 1 when a URL is asked for and no fetch program is installed.
	local	w='' e
	local	-a f

	if [ "${1%%://*}" != "$1" ]; then
		# 2023 - wget 403ing on reddit.com - curl with -L successful
		# The options are an ARRAY: as one string with quotes inside, the unquoted expansion
		# tore the User-Agent and Referer apart at every space and passed the quotes along literally.
		if w=$(command -v curl 2>/dev/null); then
			f=(-L -s -A "$WUA" -H "Referer: $WRE")
		elif w=$(command -v links 2>/dev/null); then
			f=(-dump)
		elif w=$(command -v wget 2>/dev/null); then
			f=(-O - -U "$WUA" "--referer=$WRE")
		else
			# To stderr - stdout is what gets parsed
			echo "Need curl or wget to get URLs" >&2
			exit 1
		fi
		Info "Loading: $1 with $w ${f[*]}"
		"$w" "${f[@]}" "$1" 2>/dev/null
		e=$?
		Dbg "Loaded ($e): $1"
		return $e
	elif [ -f "$1" ]; then
		# Binary plist? Text xml, anything plutil rejects, or no plutil at all: plain copy
		if command -v plutil > /dev/null 2>&1 && ! grep -q '?xml' "$1" && plutil "$1" > /dev/null 2>&1; then
			plutil -convert 'xml1' -o - "$1"
		else
			cat "$1"
		fi
	else
		echo "$1 not found" >&2
		return 1
	fi
 }

Usage()
 {
	# Print the help text on stderr (through Msg, without time prefix).
	# Defaults shown in the text are the CURRENT values of EQC, DKD, SUD and EXPAND.
	local	me text

	me=$(basename "$0")
	text="Usage: $me [-a][-b][-C][-c][-d][-e][-E c][-f][-h][-i][-k][-K c][-l][-m path][-n #][-Q c][-q][-s suffix][-t][-V][-x][-?] [URL or pathToFile, will use stdin if available]

	-a	Array item numbers blank (useful when searching for any array item)
	-b	Ignore blank lines toggle (default ON)
	-C	Output CSV format
	-c	Increase rawness content fields (parsed as XML -> HTML -> raw)
	-d	Increase verbosity
	-e	Expand full paths (vs compressed . for each matching component to parent)
	-E c	Change key-value separator (default $EQC)
	-f	Stop parsing after first match (see -m) per stream. Can specify multiple for multiple matches.
	-h	Ignore search for ?xml tag to start (e.g. parse HTML)
	-i	Disable auto indexer (array detector)
	-k	Dict/key compression toggle (default !$EXPAND) - flatten dict/key into cwp${DKD}keyname/type${EQC}value
	-K	Set dict/key compression character (default $DKD)
	-l	Line break conversion to ;; for values (happens for comments by default)
	-L crc	Change ;; to this for newline replacement
	-m pth	Match - only output if /path matches given path (at head)
	-n cnt	Next n fields after match included (-s applies)
	-q	Do not prefix each line with /path${EQC}
	-Q c	Quote character before and after values
	-r	Raw mode - just to test tag parsing code
	-s sfx  Suffix match - match backside (e.g. .jpg)
	-t	Ignore content layout tags like tables, divs, spans
	-V	Output in bash executable KV format (a=b where a is a legal variable name and b a quoted value)
	-x	Output XML ... work in progress
	-y	Output YML ... work in progress

	or - cat file.xml | $me ...

	Examples
	Version of macOS thing: xmlp -m ${DKD}CFBundleShortVersionString -q -f /System/Applications/Mail.app/Contents/Info.plist
	Reddit RSS images with URLs: xmlp -n 1 -m '${SUD}entry${SUD}content${SUD}a.href' -s '.jpg' -q 'https://reddit.com/r/cityporn/rising/.rss'
	Use StdIn: cat /path/to/xmlfile.xml | xmlp
	CSV Output: xmlp -C https://www.w3schools.com/xml/plant_catalog.xml

	Issues: Wider range of XML file testing, more intelligent choices

	Version $gVersion from $gStamped
"
	Msg '-' "$text"
 }

# ---- Main: parse the options, then process every URL/file argument, or stdin when there is none ----
URL=''

while getopts 'abCcdeE:fhiK:kL:lm:n:Q:qrs:tVxy?' o; do case "$o" in
  'a')	ARRAYNUMS=$((1 - $ARRAYNUMS))		;;	# Toggle showing array item numbers
  'b')	IGNOREBLANK=$((1 - $IGNOREBLANK))	;;	# Don't print lines with no value
  'C')	OUTCSV=$((1 + $OUTCSV))				;;	# Output CSV format
  'c')	RAWCONTENT=$((1 + $RAWCONTENT))		;;	# Rawness of HTML ouput (parsed, HTML, raw)
  'd')	DEBUG=$((1 + $DEBUG))				;;
  'e')	EXPAND=$((1 + $EXPAND))				;;	# Expand output from compressed -> paths -> XML (XML not yet done)
  'E')	EQC="$OPTARG"						;;	# Equal character
  'f')	ONLYFIRST=$((1 + $ONLYFIRST))		;;	# Return after finding first MATCH (-m)
  'h')	IGNOREXML=$((1 - $IGNOREXML))		;;	# Allow HTML?
  'i')	AUTOIDX=$((1 - $AUTOIDX))			;;	# Array detector
  'k')	FLATKEY=$((1 - $FLATKEY))			;;	# Toggle flatten keys in dicts
  'K')	DKD="$OPTARG"							# Set dict/key delimiter ...
		DDD="[$SUD$DKD$SND$PRD]"			;;	# ... and keep the path delimiter set in step with it
  'l')	MULTILINE=$((1 - $MULTILINE))		;;	# Toggle flatten multilines with LF -> $NEWLINE conversion
  'L')	NEWLINE="$OPTARG"					;;	# \n replacement string
  'm')	MATCH="$OPTARG"						;;	# Path match string
  'n')	NEXTS="$OPTARG"						;;	# Include n lines after SUFFIX is hit
  'Q')	QUOTE="$OPTARG"						;;	# Quote output values
  'q')	ONLYVALUE=$((1 - $ONLYVALUE))		;;	# No path prefix
  'r')	READRAW=1							;;	# Tag parser test
  's')	SUFFIX="$OPTARG"					;;	# Only print lines whose value ends with SUFFIX (e.g. .jpg)
  't')	IGNORELAYOUT=$((1 - $IGNORELAYOUT))	;;	# Ignore content layout
  'V')	OUTKVO=$((1 + $OUTKVO))				;;	# Output KVO format
  'x')	OUTXML=$((1 + $OUTXML))				;;	# Output XML again
  'y')	OUTYML=$((1 + $OUTYML))				;;	# Output YML
  '?')	Usage; exit 0						;;	# -? and every option getopts does not know
  *)	Msg "Unknown option: -$o"			;;	# Was $1, i.e. the first argument rather than the option
esac; done
shift $(($OPTIND - 1))

# URL doubles as the "arguments were given" flag: one line per remaining argument
for u in "$@"; do
        URL="$URL
$u"
done

# Set some KVO parameters
if [ $OUTKVO -ne 0 ]; then
	AUTOIDX=1
	EXPAND=1
	QUOTE=\'
	# DDD replacements
	SUD='_'
	SND='_'
	PRD='_'
	# Can not be in above for DDD reasons
	IXD='x'
fi

# If we get a URL - use it, else if no stdin - Usage - else process stdin
if [ -n "$URL" ]; then
	# Walk the arguments themselves: feeding $URL through "while read line" ate backslashes
	# and trimmed blanks at either end of a file name
	for u in "$@"; do
		if [ -n "$u" ]; then
			GetURL "$u" | DoTheRightThing
		fi
	done
elif [ -t 0 ]; then
	Usage
else
	DoTheRightThing
fi