# make_md_to_rst.py

"""
This script generates a folder of reST files from a folder of Markdown files.
Markdown hyperlinks between the files in the folder are converted to reST links so that they work properly in a
Sphinx-generated HTML build of the resulting reST folder.
"""

import os
import click
import shutil
from m2r import parse_from_file
import re

# Regex fragment matching a single line break: CR LF, or a lone CR or LF.
# It is a capturing group, so every pattern that embeds it gains one more group.
new_line_re = "(\r\n|[\r\n])"


def rebaseAbsRoot(path, target, root):
    """
    Return the relative path leading from the file ``path`` to the file ``target``.

    Both paths are cut at the first occurrence of ``root``. The leading chunks they share from
    there on are dropped and the remainder is expressed as ``..`` steps followed by what is
    left of ``target``.

    Functionality is limited:
    - path and target must be path strings pointing at FILES (the last chunk is a file name).
    - root should appear only once in every path (the first occurrence is used).
    - root must not contain folder separators (you can't root to os.sep).

    :param path: path of the file the link is located in.
    :param target: path of the file the link points at.
    :param root: folder name both paths are expected to contain.
    :return: the relative path from path to target. target is returned unchanged if either path
        does not contain root or if path contains "..".
    """
    path_start = path.find(root)
    target_start = target.find(root)
    if path_start == -1 or target_start == -1 or ".." in path:
        return target

    path_chunks = path[path_start:].split(os.sep)
    target_chunks = target[target_start:].split(os.sep)
    # Remove common path chunks. The last chunk of each list is the file name and is always
    # kept; without that guard two identical paths exhaust both lists and raise IndexError.
    while (
        len(path_chunks) > 1
        and len(target_chunks) > 1
        and path_chunks[0] == target_chunks[0]
    ):
        del path_chunks[0]
        del target_chunks[0]

    # one ".." per folder left between the linking file and the common folder
    up_steps = (len(path_chunks) - 1) * f"..{os.sep}"
    down_steps = os.sep.join(target_chunks)
    return os.path.join(up_steps, down_steps)


def fixTables(f_rst):
    """
    Repair the ``.. list-table::`` directives found in reST text.

    For every table the following fixes are applied:
    - an image directive placed on the line below another line is pulled up onto that line.
    - if the last row has fewer cells than the first row it is padded with empty cells, or
      deleted completely when it holds nothing but whitespace and dashes.
    - if the table consists of a single row, ``:header-rows:`` is set to 0.

    :param f_rst: reST text to repair.
    :return: the text with the repaired tables.
    """

    def pic_repl(match):
        # Rebuild the image directive so that it starts on the line preceding it.
        leading = match.groupdict()["list_level"]
        pic_dir = match.groupdict()["pic_directive"]
        pic_pad = re.match("^[ ]*", pic_dir).span()[1]
        # re-indent the directive to the width of the preceding line ...
        pic_dir = re.sub(" " * pic_pad, " " * len(leading), pic_dir)
        # ... and put the text of that line in place of the new indentation
        pic_dir = leading + pic_dir[len(leading) :]
        # keep the line break and indentation the match ended with
        end_space = re.search(f"{new_line_re}[ ]*$", match[0])
        if end_space:
            pic_dir = re.sub(f"{new_line_re}[ ]*$", end_space[0], pic_dir)
        return pic_dir

    messy_re = (
        f"(?P<list_level>.*){new_line_re}(?P<pic_directive>[ ]*.. image::[^*-]*)"
    )
    # raw strings: "\." and "\*" are regex escapes, not string escapes
    body_re = rf"((.+){new_line_re})*{new_line_re}((.+){new_line_re})*"
    table_re = rf"\.\. list-table::{new_line_re}" + body_re
    row_re = rf"   \*(?P<items>([ ]+-.*{new_line_re})*)"

    for t in list(re.finditer(table_re, f_rst)):
        tab = t[0]

        # one replacement at a time, because messed pic patterns overlap
        tab, repnum = re.subn(messy_re, pic_repl, tab, 1)
        while repnum:
            tab, repnum = re.subn(messy_re, pic_repl, tab, 1)

        bullets = tab.split("   *")[1:]
        # a table without any row has nothing to pad or delete
        if bullets:
            items = [bullet.split("     -") for bullet in bullets]
            last_items = items[-1]
            missing = len(items[0]) - len(last_items)
            if missing > 0:
                if any(re.search(r"[^\s-]", content) for content in last_items):
                    # append empty cells
                    tab += "     - \n" * missing
                else:
                    # delete the last row: drop everything from its row marker on
                    # (string slicing, so no character is interpreted as regex syntax)
                    tab = tab.rpartition("   *")[0]

        bullet_num = sum(1 for _ in re.finditer(row_re, tab))
        if bullet_num == 1:
            # fix empty body table error:
            tab = re.sub(":header-rows: [0-9]", ":header-rows: 0", tab)

        if tab != t[0]:
            f_rst = f_rst.replace(t[0], tab)

    return f_rst


def fixLinks(f_rst, f, targetpath):
    """
    Turn the markdown hyperlinks left in converted reST text into sphinx cross references.

    Links to a ``.md`` file become ``:doc:`` roles; links carrying a ``#section`` part become
    ``:ref:`` roles. Links whose target is neither of the two (e.g. external URLs) are left
    untouched. Every link is converted on its own, so links sharing name and file but pointing
    at different sections keep their own section.

    :param f_rst: reST text produced from a markdown file.
    :param f: file name of that markdown file; used as target for links naming only a section.
    :param targetpath: folder the resulting reST file is written to.
    :return: the text with the links replaced.
    """
    link_re = re.compile(
        r"(?P<numbered>\. )?`(?P<link_name>[^<`]*) "
        r"<(?P<md_link>\S*\.md)?(?P<hash>#)?(?P<section>[^>]*)>`_?"
    )

    def convert(link):
        section = link["section"]
        if section and not link["hash"]:
            # target text that is not introduced by "#": not a markdown link, keep it verbatim
            return link[0]

        link_path = link["md_link"] or f
        # change directory to point at temporal rest dir (if the link names a directory):
        link_dir, link_file = os.path.split(link_path)
        if link_dir != "":
            link_path = os.path.join(link_dir + "_m2r", link_file)
        # rebase the link to relative link if its not
        link_path = rebaseAbsRoot(os.path.join(targetpath, f), link_path, "sphinx-doc")
        # remove extension name (rst syntax)
        link_path = re.sub(r"\.md$", "", link_path)

        if section:
            # while document links have to be relative, section links are built from the part of
            # targetpath that follows the name of the current working directory
            # NOTE(review): presumably the script is run from the sphinx doc dir - confirm
            cwd_name = os.path.basename(os.path.abspath(""))
            doc_dir = targetpath[targetpath.find(cwd_name) + len(cwd_name) + 1 :]
            link_path = os.path.join(doc_dir, os.path.basename(link_path))
            role = ":ref:"
            # markdown space representation by dash has to be removed
            section = ":" + section.replace("-", " ")
            if link["numbered"]:
                # numbered section linking: put a dot behind the leading number
                section = re.sub(r"(:[0-9]+)", r"\g<1>.", section)
        else:
            role = ":doc:"
            section = ""

        # the ". " prefix is part of the match and has to be written back
        prefix = link["numbered"] or ""
        return f"{prefix}{role}`{link['link_name']} <{link_path}{section}>`"

    # a replacement function is used, so names and paths are never interpreted as regex syntax
    return link_re.sub(convert, f_rst)


@click.command()
@click.option(
    "-p",
    "--mdpath",
    type=str,
    required=True,
    default="sphinx-doc/getting_started_md",
    help="Relative path to the folder containing the .md files to be converted (relative to sphinx root).",
)
@click.option(
    "-sr",
    "--sphinxroot",
    type=str,
    required=True,
    default="..",
    help="Relative path to the sphinx root.",
)
def main(mdpath, sphinxroot):
    """Convert the .md files of a folder to .rst files in a sibling "<folder>_m2r" folder."""
    root_path = os.path.abspath(sphinxroot)
    mdpath = os.path.join(root_path, mdpath)
    targetpath = mdpath + "_m2r"

    # clear target directory:
    if os.path.isdir(targetpath):
        shutil.rmtree(targetpath)
    os.mkdir(targetpath)

    mdfiles = [f for f in os.listdir(mdpath) if os.path.splitext(f)[1] == ".md"]
    for f in mdfiles:
        f_rst = parse_from_file(os.path.join(mdpath, f))
        # regex magic- replace invalid links:
        f_rst = fixLinks(f_rst, f, targetpath)
        f_rst = fixTables(f_rst)
        # swap the extension only; str.replace would also hit ".md" inside the file name
        rst_name = os.path.splitext(f)[0] + ".rst"
        # explicit utf-8 so the output does not depend on the locale of the machine
        with open(os.path.join(targetpath, rst_name), "w", encoding="utf-8") as file_:
            file_.write(f_rst)


# Run the command line interface only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()