dev/markdown-checks

#!/usr/bin/env python

import collections as cs, itertools as it, hashlib as hl
import os, sys, re, string, base64, pathlib as pl


# Registry of all supported checks: check name -> one-line description.
# Names are what errors are tagged with, descriptions are used in help/list output.
checks_all = {
	'link_refs': 'Non-inline links like "[mylink]" have exactly one "[mylink]: URL" line for each.',
	'link_refs_unneeded': 'Inline URLs like "[mylink](URL)" when "[mylink]: URL" is also in the md.',
	'link_anchor': 'Not all headers have <a name=hdr-...> line. See also -a/--add-anchors option.',
	'link_anchor_match': 'Mismatch between header-anchors and hashtag-links pointing to them.',
	'link_files': 'Relative links point to an existing file (relative to them).',
	'link_files_weird': 'Relative links that start with non-letter/digit/hashmark.',
	'link_files_git': 'If .md file is in a git repo, warn if linked files are not under git control.',
	'link_dups': 'Multiple same-title links with URLs.',
	'quirks': 'Minor non-obvious md syntax quirks which are almost never intentional.',
	'rx_in_code': 'Command-line-specified regexp (if any) detected inside code block(s).',
	'tabs': 'Make sure md file contains no tab characters.',
	'syntax': 'Any kind of incorrect syntax, e.g. blocks opened and not closed and such.',
}


class adict(dict):
	'Dict with keys that can also be read/set as attributes, i.e. d.key is d["key"].'
	def __init__(self, *args, **kws):
		dict.__init__(self, *args, **kws)
		self.__dict__ = self # attribute storage is the mapping itself

class UnbalancedTokens(Exception): pass

def extract_toplevel(s, ab, strict=True):
	'''Generator for outermost spans enclosed in a pair of open/close characters.
		s is the string to scan, ab is a two-element open/close pair like "[]".
		Yields (start, end, content) tuples, where s[start:end] is
			the span including its enclosing characters, and content is what is between them.
		With strict=True, raises UnbalancedTokens on a stray closing
			character or on a span left open at the end of the string.
		With strict=False, stray closing characters are ignored and unclosed span is dropped.'''
	tok_open, tok_close = ab
	depth = start = 0
	for pos, ch in enumerate(s):
		if ch == tok_open:
			if depth == 0: start = pos + 1 # content begins after the opening char
			depth += 1
			continue
		if ch != tok_close: continue
		depth -= 1
		if depth == 0: yield start - 1, pos + 1, s[start:pos]
		elif depth < 0:
			if strict: raise UnbalancedTokens
			depth = 0 # forget about stray closing char
	if strict and depth: raise UnbalancedTokens

def b64_hash(data, n=4, person=b'mdchecks', **b2_kws):
	'''Return n-character alphanumeric (A-Z, a-z, 0-9) hash-string for data.
		data can be bytes or str (which gets encoded), person
			and any extra keywords are passed to hashlib.blake2s as-is.
		String is built from urlsafe-base64 of the blake2s digest, with non-alphanumeric
			characters dropped. If that is shorter than n, digest is re-hashed
			and result is appended, so that any n value eventually gets enough characters.'''
	if not isinstance(data, bytes): data = data.encode()
	digest, chars = hl.blake2s(data, person=person, **b2_kws).digest(), ''
	while True:
		b64 = base64.urlsafe_b64encode(digest).decode()
		# "=" is base64 padding, "_" and "-" are the only other non-alphanumeric chars there
		chars += b64.rstrip('=').replace('_', '').replace('-', '')
		if len(chars) >= n: return chars[:n]
		digest = hl.blake2s(digest, person=person, **b2_kws).digest()


def md_esc_strip(s, rev=False, _esc_pats = list(
		(p, chr(0xf010+n)) for n, p in enumerate(['\\\\', *r'\[ \] \( \)'.split()]) )):
	'''Replace backslash-escaped \\\\ \\[ \\] \\( \\) sequences in s with single placeholder chars,
			so that escaped brackets do not get in the way of bracket-matching.
		Placeholders are characters starting from U+F010, one per escape sequence.
		rev=True does the reverse, replacing placeholders back with escape sequences.'''
	for esc_seq, placeholder in _esc_pats:
		if rev: s = s.replace(placeholder, esc_seq)
		else: s = s.replace(esc_seq, placeholder)
	return s

def md_esc_restore(s):
	'Reverse md_esc_strip - put escape sequences back in place of placeholder chars.'
	return md_esc_strip(s, True)

def md_clean(md, code_re, errs):
	'''Return markdown lines with all code-blocks stripped.
		md is the markdown text, code_re is an optional compiled regexp
			to look for in code-block lines, errs is a list to append detected issues to.
		Handles ```-fenced blocks, `inline` and ``inline`` code spans
			(including ones continued on following lines) and 4-space-indented blocks.
		Issues are appended as adict(n=line-number, t=check-name, s=message),
			with t being one of "tabs", "syntax" or "rx_in_code".
		Returns a list of (line_number, line) tuples, with 1-based numbers
			of lines in md, and a dummy (0, '') entry at the start.'''
	st = adict( list_level=0, list_start=False,
		sep=None, code=None, code_inline=None, code_prefix=None )
	lines_cut, lines = list(), f'\n{md}'.splitlines() # \n to skip line-number=0 easily

	def _find_code_start(n, line):
		# Fenced block - keep part of the line before ```, cut from the next line
		if (c := line.find('```')) != -1:
			lines[n], st.code = line[:c], n + 1; return
		for s in '``', '`':
			while (c := line.find(s)) != -1:
				# Remove code spans that are closed on the same line
				lines[n], mn = re.subn(fr'{s}.+?{s}', '', line)
				if mn: line = lines[n]
				# Unclosed span continues on the next line(s)
				else: st.code, st.code_inline, lines[n] = n + 1, s, line[:c]; return
		# Indented block - needs empty line before it, gets cut together with this line
		if st.sep and line.startswith(pre := ' '*4):
			st.code, st.code_prefix = n, pre

	def _find_code_end(n, line):
		if (c := line.find('```')) != -1:
			if st.code_inline:
				errs.append(adict( n=n, t='syntax',
					s=f'Inline code-span {st.code_inline}-block opened on'
						f' line-{st.code}, but not closed until multiline ```-block' ))
				# list.append returns None, which is what resets code_inline here
				st.code, st.code_inline = n, lines_cut.append((st.code, n))
				return # still in the code block, just a different one
			st.code = st.code_inline = lines_cut.append((st.code, n))
			line = lines[n] = line[c+3:]
			if line.strip():
				errs.append(adict( n=n, t='syntax',
					s=f'Junk after ``` ending of the code block: {line!r}' ))
		elif st.code_inline and (c := line.find(st.code_inline)) != -1:
			line = lines[n] = line[c + len(st.code_inline):]
			st.code = st.code_inline = lines_cut.append((st.code, n))
		elif st.code_prefix and not line.startswith(st.code_prefix):
			if not st.sep:
				errs.append(adict( n=n, t='syntax',
					s='End of indented code block without empty-line separator' ))
			st.code = st.code_prefix = lines_cut.append((st.code, n))
		if not st.code and line: _find_code_start(n, line) # leftover line part might have stuff

	for n, line in enumerate(lines):
		if not line.strip(): st.sep = True; continue
		if '\t' in line: errs.append(adict(n=n, t='tabs', s='Tab character(s)'))
		if st.sep or st.code: st.list_start = False

		if not st.code:
			# Strip list-item indentation, so that it is not mistaken for an indented code block
			if st.list_level > 0:
				level = st.list_level
				while level and line.startswith(' '*4): line = line[4:]; level -= 1
				if st.list_start and level == 1 and line.startswith('  '): level -= 1
				st.list_level -= level
			if line.startswith('- '): st.list_level += 1; st.list_start = True
			_find_code_start(n, line)

		else:
			_find_code_end(n, line)
			if st.code and code_re and code_re.search(line):
				errs.append(adict( n=n, t='rx_in_code',
					s=f'Command-line --code-rx line-match inside code block: {line!r}' ))

		st.sep = False

	for n, line in enumerate(lines):
		lines[n] = n, lines[n] # preserve numbers before removing chunks
	# Indented code block running up to the end of file has no line after it
	#  to close it, so it has to be cut here, to not be parsed as markdown later
	if st.code and st.code_prefix: lines_cut.append((st.code, len(lines)))
	for a, b in reversed(lines_cut): lines[a:b] = list()
	if st.code and not st.code_prefix: errs.append(adict( n=n,
		t='syntax', s=f'Unclosed code-block started at line-{st.code}' ))
	return lines


def md_check_quirks(md_lines, errs):
	'''Append "quirks" errors to errs for lines which start with \\[ escape.
		md_lines is an iterable of (line_number, line) tuples.'''
	for n, line in md_lines:
		if not line.lstrip().startswith('\\['): continue
		# \[stuff\] at BOL translates into centered "stuff" line on codeberg
		msg = r'Line starting with \[ can turn into some weird centered-text thing'
		errs.append(adict(n=n, t='quirks', s=msg))


def md_check_header_anchors(md_lines, errs, name_max_len=40):
	'''Check that #-headers have matching <a name=hdr-...> anchor line(s) right before them.
		md_lines are (line_number, line) tuples, errs is a list to append issues to.
		Appends link_anchor errors for missing, mismatching, duplicate and leftover anchors.
		Anchor names are generated from header text, with link syntax stripped,
			and get shortened if they end up being longer than name_max_len.
		Returns {line_number: anchor_info} dict, keyed by the anchor line number,
			or by the header line number for headers that have no anchor before them.'''
	# gitea/forgejo/codeberg used to prefix rendered a-names with "user-content-"
	anchors, anchor_re = dict(), re.compile(r'<a name=((?:user-content-)?hdr(x?)-(\S+)>)</a>')
	# Characters that are kept in anchor names as-is, anything else is replaced by "_"
	str_map = dict((c, c) for c in string.ascii_lowercase + string.digits + '-._~')
	def _line_prev(last_offset):
		# Uses k/n of the current line from the loop below
		if k - last_offset < 0: return ''
		n_last, line_prev = md_lines[k - last_offset]
		if n_last != n - last_offset: return '' # discontinuity = no prev line
		return line_prev
	for k, (n, line) in enumerate(md_lines):
		if n == 1: continue # don't add anchor for first-line header, if any
		if not (m := re.match(r'#+\s+(\S.*)', line)): continue
		# ao = offset of the anchor line checked in the loop below, relative to the header
		ao, hdr = 0, md_esc_strip(m[1].strip())
		# Going through links in reverse, to not mess up offsets of the ones before them
		for a, b, link in reversed(list(extract_toplevel(hdr, '[]', False))):
			if urls := list(extract_toplevel(ls := hdr[b:], '()', False)):
				ua, ub, url = urls[0]
				# Only (url) immediately following [link] is a part of it
				if not ua: hdr = hdr[:b+ua] + hdr[b+ub:]
			hdr = hdr[:a] + link + hdr[b:] # strip link syntax and urls
		hdr = md_esc_restore(hdr)
		name = ''.join(str_map.get(c, '_') for c in hdr.lower())
		name = name.lstrip('_') # for "--- Header" and such custom prefixes
		if len(name) > name_max_len: name = re.sub(r'_+', '_', name)
		if len(name) > name_max_len + 5: # truncate long-header anchor-names
			name = name[:name_max_len] + '.' + b64_hash(name[name_max_len:], 4)
		# Check all consecutive anchor-lines right before the header, from the closest one
		while (a_list := anchor_re.findall(_line_prev(ao := ao + 1))) and (an := n - ao):
			# Multiple anchors are for diff renderers, e.g. github + codeberg
			if len(set(a[2] for a in a_list)) != 1: a_tag = a_nolink = a_name = ''
			else: a_tag, a_nolink, a_name = a_list[0]
			if not (a_match := (a_name == name)):
				errs.append(adict( n=an, t='link_anchor', s='Anchor-name'
					f' mismatch for header [ {hdr} ]: new={name!r} old={a_name!r}' ))
			# Name that anchor should have, with hdrx- prefix preserved
			a_name = ('hdr-' if not a_nolink else 'hdrx-') + name
			if a_chk := anchors.get(a_tag):
				errs.append(adict( n=an, t='link_anchor',
					s=f'Duplicate anchor with header on line-{a_chk.hdr_n}: {a_name}' ))
			anchors[a_tag] = adict( n=n - ao, name=a_name,
				hdr=hdr, hdr_n=n, exists=True, match=a_match, nolink=a_nolink )
		if ao == 1: # loop above stopped on the first check = no anchor-lines before header
			errs.append(adict( n=n, t='link_anchor',
				s=f'Missing anchor before header [ {name} ]: {hdr}' ))
			# Right side is evaluated first, so new a_name value is used as the key here
			anchors[a_name] = adict( n=n, name=(a_name := f'hdr-{name}'),
				hdr=hdr, hdr_n=n, exists=False, match=False, nolink=False )
	anchor_map = dict((a.n, a) for a in anchors.values())
	# Any hdr-anchor lines that were not matched to headers above
	for n, line in md_lines:
		if n in anchor_map or not (m := anchor_re.search(line)): continue
		errs.append(adict( n=n, t='link_anchor',
			s=f'Bogus/leftover anchor-line without a header: {line}' ))
	return anchor_map

def md_parse_misc_anchors(md_lines, anchor_map):
	'''Find <a name=...></a> anchors that are unrelated to headers.
		md_lines are (line_number, line) tuples, lines with numbers
			in anchor_map are skipped, as well as any hdr-/hdrx- anchors.
		Returns {line_number: adict(name, n)} dict for anchors that were found.'''
	anchor_re = re.compile(r'<a name=(\S+)></a>')
	found = dict()
	for n, line in md_lines:
		if n in anchor_map: continue
		m = anchor_re.search(line)
		if not m: continue
		name = m[1]
		if re.match(r'(user-content-)?hdrx?-', name): continue # header anchor
		found[n] = adict(name=name, n=n)
	return found


def md_check_links(md_lines, errs):
	'''Parse/check links in a cleaned-up/numbered md lines without code blocks.
		md_lines are (line_number, line) tuples, errs is a list to append issues to.
		Handles [title](url) inline links, [title] links with URL
			in a separate "[title]: URL" definition line, and definitions
			where URL is on the next line, indented by at least one space.
		Appends link_refs, link_refs_unneeded, link_dups and syntax errors.
		Returns {target: [links...]} to run file/path/url/git and such checks on.'''
	links, link_urls, st = dict(), dict(), adict(indented_url=None)

	for n, line_raw in md_lines:
		if not (line := md_esc_strip(line_raw.strip())): continue

		# "[title]:" definition can only take URL from the first non-empty line after it,
		#  so that state is always reset here, to not pick any later indented line as URL
		link, st.indented_url = st.indented_url, None
		if link and line_raw.startswith(' '):
			link.url = md_esc_restore(line) # same un-escaping as for same-line URLs below
			continue
		if m := re.fullmatch(r'\[(.*?)\]:(\s+(\S.*))?', line):
			if link_chk := link_urls.get(title := md_esc_restore(m[1])):
				errs.append(adict( n=n, t='link_refs',
					s=f'Duplicate URL for [{title}], matching earlier one on line-{link_chk.n}' ))
			link = link_urls[title] = adict(n=n, title=title, url=md_esc_restore(m[3] or ''))
			if not m[2]: st.indented_url = link
			# Definition line is not a use of the link - parsing [title] from it as one
			#  would mark every URL as used, and report dups for inline links with same title
			continue
		if '[' not in line: continue

		try: titles = list(extract_toplevel(line, '[]'))
		except UnbalancedTokens:
			errs.append(adict(n=n, t='syntax', s='Unbalanced [link]-brackets')); continue

		for a, b, title in titles:
			if not title: continue # [] after image-links and such
			if title != title.strip(): continue
			link = adict(n=n, title=(title := md_esc_restore(title)), url=None, dups=0)
			if urls := list(extract_toplevel(line[b:], '()', False)):
				ua, ub, url = urls[0]
				# Only (url) right after [title] is a part of the link
				if not ua: link.url = md_esc_restore(url)
			if (link.url or '').startswith('#'): # these are special - for anchor-checks only
				title = link.title = f'\uf001{link.title}'
			if (link_chk := links.get(title)) and (link_chk.url or link.url or link_chk.dups):
				errs.append(adict( n=n, t='link_dups',
					s=f'Duplicate non-uniform link [{link.title}], matching one on line-{link_chk.n}' ))
				link.dups = link_chk.dups + 1
				if not link_chk.url: link.url = None # one of them needs an url
			links[title] = link

	# Resolve links to their targets, either inline or from definition lines
	link_map = cs.defaultdict(list)
	for link in links.values():
		if not link.url and not (url := link_urls.get(link.title)):
			errs.append(adict( n=link.n, t='link_refs',
				s=f'Link [{link.title}] has no corresponding URL for it' ))
		elif link.url: link_map[link.url].append(link)
		else: url.used = True; link_map[url.url].append(link)

	for url in link_urls.values():
		if url.get('used'): continue
		errs.append(adict( n=url.n, t='link_refs_unneeded',
			s=f'URL for link [{url.title}] is not used anywhere' ))

	return link_map


def md_check_files(p_base, p_git, git_files, link_map, errs):
	'''Check that relative link targets exist and are among git_files.
		p_base is the directory which link paths are relative to,
			p_git is the repository path (only used in error messages),
			git_files is a collection of resolved paths to check targets against,
			link_map is {url: [links...]} and errs is a list to append issues to.
		Targets starting with "#" or http(s):// are skipped, #-suffix is ignored in others.
		Appends link_files_weird, link_files and link_files_git errors for each link.'''
	for url, links in link_map.items():
		if re.match(r'(?i)(#|https?://)', url): continue
		weird = re.match(r'[\w.]', url) is None
		p = (p_base / url.partition('#')[0]).resolve()
		missing = not p.exists()
		untracked = p not in git_files
		for link in links:
			if weird:
				errs.append(adict( n=link.n, t='link_files_weird',
					s=f'URL for link [{link.title}] does not start with letter/digit/hashmark [ {url} ]' ))
			if missing:
				errs.append(adict( n=link.n, t='link_files',
					s=f'File/dir for link [{link.title}] does not exist [ {url} ]: {p}' ))
				continue # no point checking git for a missing file
			if untracked:
				errs.append(adict( n=link.n, t='link_files_git', s='Git repo'
					f' [ {p_git} ] does not have file/dir for link [{link.title}] target [ {url} ]: {p}' ))


def path_names(*paths):
	'''Return short and distinct names from last path components, in the same order.
		Each name is the shortest run of trailing "/"-separated path
			components which no other path has, or all components if there is no such thing.
		Same path specified multiple times will have the same name returned for it.
		Returns empty list if no paths were specified.'''
	parts = list(
		list(filter(None, str(p).split('/'))) or [str(p)] for p in paths )
	suffix = lambda p_parts, depth: '/'.join(p_parts[-depth:])
	names, depth = [None] * len(parts), 1
	while pending := list(k for k, name in enumerate(names) if name is None):
		# Paths shorter than current depth are counted with all of their components
		counts = cs.Counter(suffix(p_parts, depth) for p_parts in parts)
		for k in pending:
			name = suffix(parts[k], depth)
			# Path with no more components to add gets the name, even if it is not unique
			if counts[name] == 1 or depth >= len(parts[k]): names[k] = name
		depth += 1
	return names

def get_git_repo_files(p):
	'''Find git repository that file p is in, and list paths that are tracked there.
		Repository is the closest parent directory of resolved p which has ".git" in it.
		Returns (None, None) if there is no such directory, and otherwise
			(repo_path, paths) tuple, where paths is a set of resolved paths for existing
			files from "git ls-files" output, their parent dirs and the repository dir itself.
		Set will be empty if git command did not list any existing files.'''
	for p_git in p.resolve().parents:
		if (p_git / '.git').exists(): break
	else: return None, None
	import subprocess as sp
	git_ls = sp.run(
		['git', 'ls-files', '-z', '--deduplicate'],
		cwd=p_git, stdout=sp.PIPE ).stdout.decode().split('\0')
	git_files = set()
	for name in git_ls:
		# Skip empty string after last \0 or from empty output - it is not a file,
		#  and would resolve to the repository dir, adding all its parent dirs as tracked
		if not name: continue
		if not (p_file := (p_git / name).resolve()).exists(): continue
		# Add file with all its parent dirs, so that links to directories can be checked too
		while p_file != p_git and p_file != p_file.parent:
			git_files.add(p_file); p_file = p_file.parent
	if git_files: git_files.add(p_git) # for links to the repository dir itself
	return p_git, git_files

def main(argv=None):
	'''Command-line entry point - parse options and run checks on all specified files.
		argv is a list of command-line arguments, with sys.argv[1:] used if it is None.
		Issues from enabled checks are printed to stderr, one per line.
		With -a/--add-anchors option, files are updated in-place instead of the usual checks.
		Returns exit code - 1 if there are any issues or added/fixed anchors, 0 otherwise.
		With --checks-list option, list is printed to stdout and None is returned.'''
	checks = adict(checks_all)

	import argparse, textwrap
	# Helper to dedent/reformat tab-indented multiline help-strings
	dd = lambda text: re.sub( r' \t+', ' ',
		textwrap.dedent(text).strip('\n') + '\n' ).replace('\t', '  ')
	parser = argparse.ArgumentParser(
		formatter_class=argparse.RawTextHelpFormatter,
		usage='%(prog)s [opts] [--] [file.md ...]',
		description=dd('''
			Check all specified markdown files for common issues.
			Returns non-zero exit code with stderr output if any of the checks detect issues.

			Currently checks for the following issues (--checks-list also lists those):\n
			{link_list}\n
			Specific checks can be enabled/disabled via command-line options.''')
			.format(link_list='\n'.join(f' - {k.replace("_", "-")} :: {v}' for k,v in checks.items())))
	parser.add_argument('md_files', nargs='*',
		help='Markdown file(s) to validate. Default: README.md')

	parser.add_argument('-c', '--checks', metavar='chk1[ chk2...]', help=dd('''
		Names of specific markdown checks to enable.
		Full list of checks can be found in -h/--help script description or via --checks-list option.
		Can be separated by commas and/or spaces (make sure to quote spaces in shell).
		Default is to enable all supported checks.'''))
	parser.add_argument('-C', '--checks-disable', metavar='chk1[ chk2...]', help=dd('''
		Same as -c/--checks option above, but to disable a list of specific check(s).
		If specified along with -c/--checks option, disables checks from a list filtered by it.'''))
	parser.add_argument('--checks-list', action='store_true',
		help='List of all checks that script supports and exit. All of them are enabled by default.')
	parser.add_argument('--code-rx', metavar='regexp', help=dd('''
		Python re-module regexp to look for in code-blocks' lines and warn about, if found.
		Intended for detecting when/where specific known-markdown parts get into
			code-blocks unintentionally (by accident), e.g. through unbalanced ```...```,
			as a kind of sanity-check that opening/closing block-quotes haven't been inverted.'''))
	parser.add_argument('-a', '--add-anchors', action='store_true', help=dd('''
		Check that every header has an <a name=hdr-...>
			anchor line before it, add missing ones and exit.
		Only #-prefix headers are handled as such, all underlined ones are ignored.
		This modifies files and suppresses normal script operation mode.
		Exit code is 0 if no anchors were added, and non-zero otherwise.'''))

	opts = parser.parse_args(sys.argv[1:] if argv is None else argv)

	if opts.checks_list:
		print()
		for k, v in checks_all.items(): print(f'{k.replace("_", "-")} :: {v}')
		return print()
	# Check names are accepted with either dashes or underscores, in any case
	if opts.checks:
		check_filter = opts.checks.lower().replace(',', ' ').replace('-', '_').split()
		for k in check_filter:
			if k not in checks_all: parser.error(f'Unsupported -c/--checks value: {k}')
		checks = adict((k,v) for k,v in checks.items() if k in check_filter)
	if opts.checks_disable:
		for k in opts.checks_disable.lower().replace(',', ' ').replace('-', '_').split():
			if k not in checks_all: parser.error(f'Unsupported -C/--checks-disable value: {k}')
			if k in checks: del checks[k]

	err_code, code_re = 0, (rx := opts.code_rx) and re.compile(rx)
	if not opts.md_files: opts.md_files = ['README.md']
	for p, p_name in zip(ps := list(map(pl.Path, opts.md_files)), path_names(*ps)):
		md_lines = md_clean((p := pl.Path(p)).read_text(), code_re, errs := list())
		md_check_quirks(md_lines, errs)

		anchor_map = md_check_header_anchors(md_lines, errs)
		if opts.add_anchors:
			for n, a in list(anchor_map.items()):
				if a.match: del anchor_map[n] # no need to fix those
			# File is replaced by its updated copy, which is removed on any errors
			p_tmp = p.parent / f'{p.name}.md-checks.new'
			try:
				with p.open() as src, p_tmp.open('w') as dst:
					ins_n = ins_name = None
					for n, line in enumerate(src, 1):
						if (a := anchor_map.get(n)) and not (a.exists and a.match):
							ins_n, ins_name = a.hdr_n, a.name
							if a.exists: continue # drop old anchor-line, new one is added below
						if n == ins_n: dst.write(f'<a name={ins_name}></a>\n')
						dst.write(line)
				p_tmp.rename(p); p_tmp = None
			finally:
				if p_tmp: p_tmp.unlink(missing_ok=True)
			if anchor_map:
				print( f'{p_name} :: Fixed {len(anchor_map)}'
					' missing/mismatching anchor(s)', file=sys.stderr )
				err_code = 1
			continue

		link_map = md_check_links(md_lines, errs)

		# Match #-links against header and any other anchors, counting links to each one
		anchor_names = dict((a.name, a) for a in anchor_map.values())
		anchor_names.update( (a.name, a) for a in
			md_parse_misc_anchors(md_lines, anchor_map).values() )
		for url, links in link_map.items():
			if not (name := (url or '')).startswith('#'): continue
			if not (a := anchor_names.get(name := name[1:])):
				for link in links: errs.append(adict( n=link.n,
					t='link_anchor_match', s=f'Link to a non-existing anchor: {url}' ))
			else: a.setdefault('c', 0); a.c += 1
		for name, a in anchor_names.items():
			if not a.get('c') and not a.get('nolink'): errs.append(adict(
				n=a.n, t='link_anchor_match', s=f'Anchor with no links to it: {name}' ))

		p_git, git_files = get_git_repo_files(p)
		if p_git and not git_files:
			errs.append(adict( n=0, t='link_files_git',
				s=f'Detected git repository [ {p_git} ] appears to be empty' ))
		if git_files: md_check_files(p.resolve().parent, p_git, git_files, link_map, errs)

		# All checks always run, disabled ones are only filtered-out from the output
		for err in errs:
			if err.t not in checks: continue
			print(f'{p_name}:{err.n} :: {err.t} :: {err.s}', file=sys.stderr); err_code = 1

	return err_code

if __name__ == '__main__':
	# Script entry point, with main() return value used as an exit code
	try: exit_code = main()
	except BrokenPipeError: # stdout pipe closed
		# Replace stdout with devnull, presumably so that nothing fails on writes to it after this
		os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stdout.fileno())
		exit_code = 1
	sys.exit(exit_code)