catch2/scripts/updateDocumentToC.py

448 lines
14 KiB
Python
Raw Normal View History

#!/usr/bin/env python
#
# updateDocumentToC.py
#
# Insert table of contents at top of Catch markdown documents.
#
# This script is distributed under the GNU General Public License v3.0
#
# It is based on markdown-toclify version 1.7.1 by Sebastian Raschka,
# https://github.com/rasbt/markdown-toclify
#
from __future__ import print_function
import argparse
import glob
import os
import re
import sys
from scriptCommon import catchPath
# Configuration:
minTocEntries = 4
headingExcludeDefault = [1,3,4,5] # use level 2 headers for at default
headingExcludeRelease = [2,3,4,5] # use level 1 headers for release-notes.md
documentsDefault = os.path.join(os.path.relpath(catchPath), 'docs/*.md')
releaseNotesName = 'release-notes.md'
contentTitle = '**Contents**'
contentLineNo = 4
contentLineNdx = contentLineNo - 1
# End configuration
VALIDS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-&'
def readLines(in_file):
"""Returns a list of lines from a input markdown file."""
with open(in_file, 'r') as inf:
in_contents = inf.read().split('\n')
return in_contents
def removeLines(lines, remove=('[[back to top]', '<a class="mk-toclify"')):
"""Removes existing [back to top] links and <a id> tags."""
if not remove:
return lines[:]
out = []
for l in lines:
if l.startswith(remove):
continue
out.append(l)
return out
def removeToC(lines):
"""Removes existing table of contents starting at index contentLineNdx."""
if not lines[contentLineNdx ].startswith(contentTitle):
return lines[:]
result_top = lines[:contentLineNdx]
pos = contentLineNdx + 1
while lines[pos].startswith('['):
pos = pos + 1
result_bottom = lines[pos + 1:]
return result_top + result_bottom
def dashifyHeadline(line):
"""
Takes a header line from a Markdown document and
returns a tuple of the
'#'-stripped version of the head line,
a string version for <a id=''></a> anchor tags,
and the level of the headline as integer.
E.g.,
>>> dashifyHeadline('### some header lvl3')
('Some header lvl3', 'some-header-lvl3', 3)
"""
stripped_right = line.rstrip('#')
stripped_both = stripped_right.lstrip('#')
level = len(stripped_right) - len(stripped_both)
stripped_wspace = stripped_both.strip()
# character replacements
replaced_colon = stripped_wspace.replace('.', '')
replaced_slash = replaced_colon.replace('/', '')
rem_nonvalids = ''.join([c if c in VALIDS
else '-' for c in replaced_slash])
lowered = rem_nonvalids.lower()
dashified = re.sub(r'(-)\1+', r'\1', lowered) # remove duplicate dashes
dashified = dashified.strip('-') # strip dashes from start and end
# exception '&' (double-dash in github)
dashified = dashified.replace('-&-', '--')
return [stripped_wspace, dashified, level]
def tagAndCollect(lines, id_tag=True, back_links=False, exclude_h=None):
"""
Gets headlines from the markdown document and creates anchor tags.
Keyword arguments:
lines: a list of sublists where every sublist
represents a line from a Markdown document.
id_tag: if true, creates inserts a the <a id> tags (not req. by GitHub)
back_links: if true, adds "back to top" links below each headline
exclude_h: header levels to exclude. E.g., [2, 3]
excludes level 2 and 3 headings.
Returns a tuple of 2 lists:
1st list:
A modified version of the input list where
<a id="some-header"></a> anchor tags where inserted
above the header lines (if github is False).
2nd list:
A list of 3-value sublists, where the first value
represents the heading, the second value the string
that was inserted assigned to the IDs in the anchor tags,
2018-03-07 10:08:35 +01:00
and the third value is an integer that represents the headline level.
E.g.,
[['some header lvl3', 'some-header-lvl3', 3], ...]
"""
out_contents = []
headlines = []
for l in lines:
saw_headline = False
orig_len = len(l)
l_stripped = l.lstrip()
if l_stripped.startswith(('# ', '## ', '### ', '#### ', '##### ', '###### ')):
# comply with new markdown standards
# not a headline if '#' not followed by whitespace '##no-header':
if not l.lstrip('#').startswith(' '):
continue
# not a headline if more than 6 '#':
if len(l) - len(l.lstrip('#')) > 6:
continue
# headers can be indented by at most 3 spaces:
if orig_len - len(l_stripped) > 3:
continue
# ignore empty headers
if not set(l) - {'#', ' '}:
continue
saw_headline = True
dashified = dashifyHeadline(l)
if not exclude_h or not dashified[-1] in exclude_h:
if id_tag:
id_tag = '<a class="mk-toclify" id="%s"></a>'\
% (dashified[1])
out_contents.append(id_tag)
headlines.append(dashified)
out_contents.append(l)
if back_links and saw_headline:
out_contents.append('[[back to top](#table-of-contents)]')
return out_contents, headlines
def positioningHeadlines(headlines):
"""
Strips unnecessary whitespaces/tabs if first header is not left-aligned
"""
left_just = False
for row in headlines:
if row[-1] == 1:
left_just = True
break
if not left_just:
for row in headlines:
row[-1] -= 1
return headlines
def createToc(headlines, hyperlink=True, top_link=False, no_toc_header=False):
"""
Creates the table of contents from the headline list
that was returned by the tagAndCollect function.
Keyword Arguments:
headlines: list of lists
e.g., ['Some header lvl3', 'some-header-lvl3', 3]
hyperlink: Creates hyperlinks in Markdown format if True,
e.g., '- [Some header lvl1](#some-header-lvl1)'
top_link: if True, add a id tag for linking the table
of contents itself (for the back-to-top-links)
no_toc_header: suppresses TOC header if True.
Returns a list of headlines for a table of contents
in Markdown format,
e.g., [' - [Some header lvl3](#some-header-lvl3)', ...]
"""
processed = []
if not no_toc_header:
if top_link:
processed.append('<a class="mk-toclify" id="table-of-contents"></a>\n')
processed.append(contentTitle + '<br>')
for line in headlines:
if hyperlink:
item = '[%s](#%s)' % (line[0], line[1])
else:
item = '%s- %s' % ((line[2]-1)*' ', line[0])
processed.append(item + '<br>')
processed.append('\n')
return processed
def buildMarkdown(toc_headlines, body, spacer=0, placeholder=None):
"""
Returns a string with the Markdown output contents incl.
the table of contents.
Keyword arguments:
toc_headlines: lines for the table of contents
as created by the createToc function.
body: contents of the Markdown file including
ID-anchor tags as returned by the
tagAndCollect function.
spacer: Adds vertical space after the table
of contents. Height in pixels.
placeholder: If a placeholder string is provided, the placeholder
will be replaced by the TOC instead of inserting the TOC at
the top of the document
"""
if spacer:
spacer_line = ['\n<div style="height:%spx;"></div>\n' % (spacer)]
toc_markdown = "\n".join(toc_headlines + spacer_line)
else:
toc_markdown = "\n".join(toc_headlines)
if placeholder:
body_markdown = "\n".join(body)
markdown = body_markdown.replace(placeholder, toc_markdown)
else:
body_markdown_p1 = "\n".join(body[:contentLineNdx ]) + '\n'
body_markdown_p2 = "\n".join(body[ contentLineNdx:])
markdown = body_markdown_p1 + toc_markdown + body_markdown_p2
return markdown
def outputMarkdown(markdown_cont, output_file):
"""
Writes to an output file if `outfile` is a valid path.
"""
if output_file:
with open(output_file, 'w') as out:
out.write(markdown_cont)
def markdownToclify(
input_file,
output_file=None,
min_toc_len=2,
github=False,
back_to_top=False,
nolink=False,
no_toc_header=False,
spacer=0,
placeholder=None,
exclude_h=None):
""" Function to add table of contents to markdown files.
Parameters
-----------
input_file: str
Path to the markdown input file.
2018-03-07 10:08:35 +01:00
output_file: str (default: None)
Path to the markdown output file.
min_toc_len: int (default: 2)
Miniumum number of entries to create a table of contents for.
github: bool (default: False)
Uses GitHub TOC syntax if True.
back_to_top: bool (default: False)
Inserts back-to-top links below headings if True.
nolink: bool (default: False)
Creates the table of contents without internal links if True.
no_toc_header: bool (default: False)
Suppresses the Table of Contents header if True
spacer: int (default: 0)
Inserts horizontal space (in pixels) after the table of contents.
placeholder: str (default: None)
Inserts the TOC at the placeholder string instead
of inserting the TOC at the top of the document.
exclude_h: list (default None)
Excludes header levels, e.g., if [2, 3], ignores header
levels 2 and 3 in the TOC.
Returns
-----------
changed: Boolean
True if the file has been updated, False otherwise.
"""
cleaned_contents = removeLines(
removeToC(readLines(input_file)),
remove=('[[back to top]', '<a class="mk-toclify"'))
processed_contents, raw_headlines = tagAndCollect(
cleaned_contents,
id_tag=not github,
back_links=back_to_top,
exclude_h=exclude_h)
# add table of contents?
if len(raw_headlines) < min_toc_len:
processed_headlines = []
else:
leftjustified_headlines = positioningHeadlines(raw_headlines)
processed_headlines = createToc(
leftjustified_headlines,
hyperlink=not nolink,
top_link=not nolink and not github,
no_toc_header=no_toc_header)
if nolink:
processed_contents = cleaned_contents
cont = buildMarkdown(
toc_headlines=processed_headlines,
body=processed_contents,
spacer=spacer,
placeholder=placeholder)
if output_file:
outputMarkdown(cont, output_file)
def isReleaseNotes(f):
return os.path.basename(f) == releaseNotesName
def excludeHeadingsFor(f):
return headingExcludeRelease if isReleaseNotes(f) else headingExcludeDefault
def updateSingleDocumentToC(input_file, min_toc_len, verbose=False):
"""Add or update table of contents in specified file. Return 1 if file changed, 0 otherwise."""
if verbose :
print( 'file: {}'.format(input_file))
output_file = input_file + '.tmp'
markdownToclify(
input_file=input_file,
output_file=output_file,
min_toc_len=min_toc_len,
github=True,
back_to_top=False,
nolink=False,
no_toc_header=False,
spacer=False,
placeholder=False,
exclude_h=excludeHeadingsFor(input_file))
# prevent race-condition (Python 3.3):
if sys.version_info >= (3, 3):
os.replace(output_file, input_file)
else:
os.remove(input_file)
os.rename(output_file, input_file)
return 1
def updateDocumentToC(paths, min_toc_len, verbose):
"""Add or update table of contents to specified paths. Return number of changed files"""
n = 0
for g in paths:
for f in glob.glob(g):
if os.path.isfile(f):
n = n + updateSingleDocumentToC(input_file=f, min_toc_len=min_toc_len, verbose=verbose)
return n
def updateDocumentToCMain():
"""Add or update table of contents to specified paths."""
parser = argparse.ArgumentParser(
description='Add or update table of contents in markdown documents.',
epilog="""""",
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
'Input',
metavar='file',
type=str,
nargs=argparse.REMAINDER,
help='files to process, at default: docs/*.md')
parser.add_argument(
'-v', '--verbose',
action='store_true',
help='report the name of the file being processed')
parser.add_argument(
'--min-toc-entries',
dest='minTocEntries',
default=minTocEntries,
type=int,
metavar='N',
2018-03-07 10:08:35 +01:00
help='the minimum number of entries to create a table of contents for [{default}]'.format(default=minTocEntries))
parser.add_argument(
'--remove-toc',
action='store_const',
dest='minTocEntries',
const=99,
help='remove all tables of contents')
args = parser.parse_args()
paths = args.Input if args.Input else [documentsDefault]
changedFiles = updateDocumentToC(paths=paths, min_toc_len=args.minTocEntries, verbose=args.verbose)
if changedFiles > 0:
print( "Processed table of contents in " + str(changedFiles) + " file(s)" )
else:
print( "No table of contents added or updated" )
if __name__ == '__main__':
updateDocumentToCMain()
# end of file