dec-doc-proc/docproc.py

#!/usr/bin/env python3
import sys
from dataclasses import dataclass
import re
import io
from typing import TextIO, Union

import click


@dataclass
class Line:
    """One input line, split into the fields Processor.real_do_line extracts."""
    # Leading columns of the raw line (Processor.ln_cols characters wide);
    # emitted as the operand of `tN`.
    line_no: str
    # True when the raw line has a non-space character at index
    # Processor.change_col; such lines get an `mC` mark when emitted.
    is_change: bool
    # Text of the raw line from index Processor.marginalia onwards
    # ("" when the raw line is shorter than that).
    content: str


@dataclass
class Outline:
    """One outline entry, possibly with nested sub-entries."""
    # Destination name the entry points at, e.g. "sec.1.2"; emitted as the
    # first operand of `mO`.
    dest_name: str
    # Text emitted as the entry's label.
    label: str
    # Child entries.  Their count is emitted with this entry and the children
    # themselves are emitted immediately after it.
    subitems: list["Outline"]

# Roman-numeral lookup tables, populated by build_roman() at import time.
#   roman_lookup:     [(place value, ["", numeral for 1, ..., numeral for 9])],
#                     ordered from units up to thousands.
#   roman_lookup_rev: lowercase numeral fragment -> integer value.
#   roman_lookup_re:  anchored regex with one capture group per decimal
#                     place, most significant place first.
roman_lookup = []
roman_lookup_rev = {}
roman_lookup_re = re.compile("")


def build_roman():
    """(Re)build the roman-numeral lookup tables and the matching regex.

    Safe to call more than once: the tables are cleared before being refilled.
    Numerals from 1 to 3999 are covered.
    """
    # Without this the compiled regex would only be bound to a local name.
    global roman_lookup_re

    # One string per decimal place; the leading "/" yields the empty fragment
    # that stands for the digit 0.
    roman_lookup_raw = [
        "/i/ii/iii/iv/v/vi/vii/viii/ix",
        "/x/xx/xxx/xl/l/lx/lxx/lxxx/xc",
        "/c/cc/ccc/cd/d/dc/dcc/dccc/cm",
        "/m/mm/mmm",
    ]
    roman_lookup.clear()
    roman_lookup_rev.clear()
    re_parts = []
    for i, pats in enumerate(roman_lookup_raw):
        pats = pats.split('/')
        base = 10**i
        roman_lookup.append((base, pats))
        re_parts.append("(" + "|".join(pats) + ")")
        for j, pat in enumerate(pats):
            roman_lookup_rev[pat] = base * j
    # Numerals are written most-significant place first, so the groups are
    # joined in reverse table order.  re.ASCII keeps IGNORECASE from matching
    # non-ASCII look-alikes that would then be missing from roman_lookup_rev.
    roman_lookup_re = re.compile(
        "^" + "".join(reversed(re_parts)) + "$",
        re.IGNORECASE | re.ASCII,
    )


build_roman()


def romanize(n):
    """Return *n* as a lowercase roman numeral.

    Raises:
        ValueError: if *n* is outside the representable range 1..3999.
    """
    if not 0 < n < 4000:
        raise ValueError(f"{n!r} cannot be written as a roman numeral")
    # roman_lookup runs units -> thousands; output is thousands -> units.
    return "".join(pats[(n // base) % 10] for base, pats in reversed(roman_lookup))


def unromanize(s):
    """Return the integer value of the roman numeral *s* (case-insensitive).

    Raises:
        ValueError: if *s* is empty or not a well-formed roman numeral.
    """
    # Every group may match the empty fragment, so the empty string has to be
    # rejected explicitly.
    m = roman_lookup_re.fullmatch(s) if s else None
    if m:
        return sum(roman_lookup_rev[item.lower()] for item in m.groups())
    raise ValueError(f"{s!r} is not a valid roman numeral")

# Characters that need a backslash in front of them inside a PostScript
# string literal: both parentheses and the backslash itself.
ps_str_escapes = re.compile(r'([()\\])')


def ps_string(s):
    """Return *s* as a parenthesised PostScript string literal.

    Every "(", ")" and backslash in *s* is preceded by a backslash; all other
    characters are passed through unchanged.
    """
    escaped = ps_str_escapes.sub(lambda hit: "\\" + hit.group(1), s)
    return "(" + escaped + ")"


class Processor:
    """Turn paginated line-printer text into PostScript page descriptions.

    Lines are fed in through do_line().  They are buffered into pages, which
    are split at form feeds or, as long as no form feed has been seen, at the
    confidentiality banner line.  Each finished page is written to the output
    as a sequence of operator calls (bP, gL, tH, tN, tB, mC, mL, mD, sP);
    these operators are presumably defined by prelude.ps, which is copied to
    the output first.  Table-of-contents pages additionally produce link
    rectangles and outline entries, and body pages produce named
    destinations for chapter and section headings.
    """

    # Banner that opens every page; used to find page breaks when the input
    # has no form feeds.
    line1pat = re.compile(r'^\s*Digital Equipment Corporation\s\s+Confidential And Proprietary\s*$')
    # Running header: "<chapter title>  Page <label>".
    ch_pat = re.compile(r'^\s*((?:\S|\s\S)+)\s\s+Page (\S+)')
    # TOC line that opens a chapter, an appendix, or the figure/table lists.
    toc_chapter_re = re.compile(r'^\s*(CHAPTER \d+|APPENDIX [A-Z]+|FIGURES|TABLES)(?:\s+((?:\S| \S)+))?\s*$')
    # TOC entry: "<number>  <title> . . . <page>".
    toc_section_re = re.compile(r'^\s*((?:\d+|[A-Z])(?:(?:\.\d+)+|-\d+))\s+((?:\S|\s[^. ])+)\s+\s*(?:\. )*\s*([A-Z]-\d+|\d+-\d+)$')
    # Chapter/appendix heading in the body.  (Spelled "APENDIX" before, which
    # could never match the "APPENDIX" headings the TOC pattern links to.)
    bdy_chapter_re = re.compile(r'^\s*(?:CHAPTER|APPENDIX)\s*(\d+|[A-Z]+)\s*$')
    # Section heading in the body: "<number>  <title>".
    bdy_section_re = re.compile(r'^\s*((?:[A-Z]|\d+)(?:\.\d+)+)\s\s((?:\S|\s\S)+)\s*$')

    # Config settings
    out: TextIO = sys.stdout
    guess_page = True       # split pages at the banner until a form feed is seen
    ln_cols = 6             # width of the line-number field at the start of a line
    change_col = 8          # index of the change-marker character
    include_num = False     # (not read by the code in this class)
    number_color = 0.5      # (not read by the code in this class)
    header_color = 0.5      # (not read by the code in this class)
    marginalia = 11         # index at which the page content starts
    header_lines = 4        # leading lines of a page emitted with tH
    trailer_lines = 1       # trailing lines of a page emitted with tH

    fontsize = 10           # (not read by the code in this class)
    linespc = fontsize * 1.2

    # Lines buffered for the page currently being read.  Created per instance
    # in __init__ so that processors never share a buffer.
    lines: "list[Line]"

    # running state
    last_chapter = None     # chapter title from the most recent page header
    is_toc = False          # True while inside the "CONTENTS" chapter
    last_page = None        # label of the most recently emitted page
    page_n: int = 1         # ordinal of the next page to emit
    input_line: int = 0     # number of input lines consumed so far
    pg_start: int = 0       # value of input_line when the current page began
    toc_pfx: str = "sec."   # destination prefix for TOC entries

    chapters: dict[str, str]
    outline: "list[Outline]"

    # True when self.out was opened here from a path (see close()).
    _owns_out = False

    def __init__(self, file: Union[TextIO, str]):
        """Set up the output and copy prelude.ps to it.

        Args:
            file: an open text stream, or a path that is opened for writing.

        prelude.ps is read from the current working directory.
        """
        if isinstance(file, str):
            self.out = open(file, "wt")
            self._owns_out = True
        else:
            self.out = file

        self.lines = []
        self.outline = []
        self.chapters = {}

        # Emit front-matter
        with open("prelude.ps", "rt") as prelude:
            for line in prelude:
                self.emit(line.strip("\n"))

    def close(self):
        """Close the output stream if this Processor opened it from a path."""
        if self._owns_out:
            self.out.close()
            self._owns_out = False

    def do_line(self, line):
        """Consume one input line (without its newline).

        A form feed ends the current page; once one has been seen, banner
        based page detection is switched off for the rest of the input.
        """
        if '\f' in line:
            self.guess_page = False
            segments = line.split('\f')
            for segment in segments[:-1]:
                if segment != "":
                    self.real_do_line(segment)
                self.flush_page()
            self.real_do_line(segments[-1])
        else:
            self.real_do_line(line)
        self.input_line += 1

    def real_do_line(self, line):
        """Split one form-feed-free line into its fields and buffer it."""
        # Expand tabs to 8-column stops.  Unlike padding after every
        # tab-separated piece, this adds nothing after the last piece, so no
        # trailing blanks inflate the page width.
        line = line.expandtabs(8)
        if self.guess_page and self.line1pat.match(line):
            self.flush_page()
        is_change = len(line) > self.change_col and line[self.change_col] != ' '
        if len(line) >= self.marginalia:
            line_no = line[:self.ln_cols]
            content = line[self.marginalia:]
        else:
            # Too short to carry content: keep a blank-padded number field.
            line_no = (line + " " * self.ln_cols)[:self.ln_cols]
            content = ""
        self.lines.append(Line(line_no=line_no, content=content, is_change=is_change))

    def flush_page(self):
        """Emit the buffered page and start a new one.

        Does nothing when no lines are buffered (for example when the very
        first input line is the banner, or for back-to-back form feeds).
        """
        if not self.lines:
            self.pg_start = self.input_line
            return

        height = len(self.lines)
        width = max(len(line.content) for line in self.lines)

        last_chapter = self.last_chapter
        # The running header is looked for on the third line of the page;
        # shorter pages are treated as having none.
        chapter_m = self.ch_pat.match(self.lines[2].content) if height > 2 else None
        if chapter_m is not None:
            self.last_chapter = chapter = chapter_m.group(1)
            page = chapter_m.group(2)
        else:
            print(f"Warning: no page number found on pg {self.pg_start}", file=sys.stderr)
            chapter = last_chapter
            page = self.guess_next_page()

        self.last_page = page

        if chapter != last_chapter:
            self.is_toc = chapter == "CONTENTS"

        lines = list(enumerate(self.lines))
        # Explicit bounds so that header, body and trailer never overlap on
        # short pages and trailer_lines == 0 is handled.
        body_start = min(self.header_lines, height)
        body_end = max(body_start, height - self.trailer_lines)
        body = lines[body_start:body_end]

        if self.is_toc:
            links, dests = self._scan_toc(body)
        else:
            links = []
            dests = self._scan_body(body)

        # Begin emitting page
        self.emit(f"%%Page: {ps_string(page)} {self.page_n}")
        self.emit(f"{height} {width} {ps_string(page)} bP")

        for i, line in lines[:body_start]:
            self.emit(f"{i} gL {ps_string(line.content)} tH")

        for i, line in body:
            chg = " mC" if line.is_change else ""
            self.emit(f"{i} gL {ps_string(line.line_no)} tN {ps_string(line.content)} tB{chg}")

        for i, line in lines[body_end:]:
            self.emit(f"{i} gL {ps_string(line.content)} tH")

        for link in links:
            self.emit(f"{link[0]} {link[1]} {link[2]} {link[3]} {ps_string(link[4])} mL")
        for dest in dests:
            self.emit(f"{dest[0]} {ps_string(dest[1])} mD")

        self.emit("sP")

        self.lines = []
        self.pg_start = self.input_line
        # The %%Page ordinal has to advance with every emitted page.
        self.page_n += 1

    def _scan_toc(self, body):
        """Collect links and destinations from the body of a TOC page.

        Returns (links, dests): links are (left col, first line, right col,
        last line, destination name) tuples, dests are (line, name) pairs.
        Outline entries are added to self.outline as a side effect.
        """
        links = []
        dests = []
        partial_toc = ""
        partial_toc_start = None
        for i, line in body:
            if line.content.strip() == "":
                partial_toc_start = None
                partial_toc = ""
                continue
            elif line.content.strip() == "CONTENTS":
                dests.append((i, "sec.CONTENTS"))
                self.outline.append(Outline("sec.CONTENTS", "CONTENTS", []))
                continue
            elif m := self.toc_chapter_re.match(line.content):
                label = m.group(1)
                title = m.group(2)
                if ' ' in label:
                    # "CHAPTER 3" / "APPENDIX B": link to the number or letter.
                    num = label.split(' ')[1]
                else:
                    num = label
                    # The figure and table lists are themselves link targets,
                    # and entries that follow get their own prefix.
                    if label == "FIGURES":
                        self.toc_pfx = "sec.FIGURES."
                        dests.append((i, "sec.FIGURES"))
                    elif label == "TABLES":
                        self.toc_pfx = "sec.TABLES."
                        dests.append((i, "sec.TABLES"))
                partial_toc_start = None
                partial_toc = ""
                left = len(line.content) - len(line.content.lstrip())
                right = len(line.content.rstrip())
                links.append((left, i, right, i, f"sec.{num}"))
                self.outline.append(Outline(f"sec.{num}", f"{label} - {title}" if title else label, []))
                continue

            # TODO: This won't work if an entry is split across pages.
            # To fix, partial_toc and partial_toc_start must be saved after/restored before
            # this function
            if partial_toc_start is not None:
                partial_toc += " " + line.content.strip()
            else:
                partial_toc = line.content.rstrip()
                partial_toc_start = i

            if m := self.toc_section_re.match(partial_toc):
                num = m.group(1)
                title = m.group(2)
                left = partial_toc.index(num[0])
                right = len(line.content.rstrip())
                links.append((left, partial_toc_start, right, i, f"{self.toc_pfx}{num}"))
                self.add_outline(self.toc_pfx + num, title)
                partial_toc = ""
                partial_toc_start = None
            elif partial_toc:
                print(f"No match: {partial_toc!r}", file=sys.stderr)
        return links, dests

    def _scan_body(self, body):
        """Collect (line, name) destinations for headings on a body page."""
        dests = []
        partial_toc = ""
        partial_toc_start = None
        for i, line in body:
            if line.content.strip() == "":
                partial_toc_start = None
                partial_toc = ""
                continue
            elif m := self.bdy_chapter_re.match(line.content):
                num = "sec." + m.group(1)
                dests.append((i, num))

            # Consecutive non-blank lines are joined so that a heading that
            # wraps onto a second line can still match.
            if partial_toc_start is not None:
                partial_toc += " " + line.content.strip()
            else:
                partial_toc = line.content.rstrip()
                partial_toc_start = i

            # NOTE(review): the accumulator is not reset after a match, so a
            # heading followed directly by more text can be recorded again
            # for the same line - confirm whether that is intended.
            if m := self.bdy_section_re.match(partial_toc):
                num = m.group(1)
                dests.append((partial_toc_start, "sec." + num))
        return dests

    def guess_next_page(self):
        """Return a label for a page that carries no page number.

        "<chapter>-<n>" labels (numeric or appendix-letter chapter) continue
        with n + 1, roman labels continue with the next roman numeral, and
        the very first page is "i".  A previous label that fits neither form
        is reused unchanged.
        """
        if self.last_page is None:
            return "i"
        if m := re.match(r"^((?:\d*|[A-Z]+)-)(\d+)$", self.last_page):
            return f"{m.group(1)}{1 + int(m.group(2))}"
        try:
            return romanize(1 + unromanize(self.last_page))
        except ValueError:
            # Neither numbering scheme applies; one odd header should not
            # abort the whole run.
            return self.last_page

    def emit(self, s):
        """Write one line of output."""
        print(s, file=self.out)

    def emit_trailer(self):
        """Write the document trailer, including the collected outline."""
        self.emit("%%Trailer")
        self.emit_outline(self.outline)
        self.emit("%%EOF")

    def emit_outline(self, outline: "list[Outline]"):
        """Emit *outline* depth-first, each entry followed by its children."""
        for item in outline:
            self.emit(f"{ps_string(item.dest_name)} {ps_string(item.label)} {len(item.subitems)} mO")
            self.emit_outline(item.subitems)

    def add_outline(self, dest, title):
        """Insert an outline entry for *dest* under its deepest existing ancestor.

        An ancestor is an entry whose dest_name followed by "." is a prefix
        of *dest*.  When the direct parent is missing the entry is still
        added at the deepest level found, and a warning is printed.
        """
        entry = Outline(dest, title, [])
        typ = dest.split('.', 1)[0]

        parent = typ
        siblings = self.outline
        while True:
            for item in siblings:
                if dest.startswith(item.dest_name + "."):
                    parent, siblings = item.dest_name, item.subitems
                    break
            else:
                break

        # Reported instead of asserted: this depends on the input document,
        # and an assert would vanish under "python -O".
        if typ == "sec" and dest.rsplit(".", 1)[0] != parent:
            print(f"Warning: outline entry {dest!r} has no direct parent (placed under {parent!r})",
                  file=sys.stderr)
        siblings.append(entry)


@click.command()
@click.option("-o", "--output", type=click.File("w"))
@click.argument("input", type=click.File("r"), default="-")
def main(output, input):
    """Convert the text listing INPUT to PostScript.

    INPUT defaults to standard input; the result goes to standard output
    unless --output is given.
    """
    proc = Processor(output or sys.stdout)
    for line in input:
        proc.do_line(line.strip("\n"))
    # Only flush when something is buffered: with empty input there is no
    # final page to emit.
    if proc.lines:
        proc.flush_page()
    proc.emit_trailer()


if __name__ == '__main__':
    main()