Initial commit

2025-01-26 03:02:54 +01:00
commit fa7499b001
6 changed files with 39305 additions and 0 deletions
--- a/DroidSansMonoSlashed.ttf
+++ b/DroidSansMonoSlashed.ttf
--- a/README.adoc
+++ b/README.adoc
@@ -0,0 +1,26 @@
+= DEC documentation processor
+
+== Intro
+This converts DEC-derived .txt files (from whatever tool they used to generate documents) to PostScript, which `ps2pdf` then turns into a PDF including
+a table of contents (which is handy with 400-odd page documents).
+
+The code is terrible, and a non-trivial amount of the heavy lifting (including
+all the page layout) is done in PostScript, but it works for the Mass
+Storage Control Protocol documents from bitsavers, so I'm happy.
+
+== Usage
+
+....
+python3 docproc.py -o mscp.ps mscp.txt
+ps2pdf mscp.ps mscp.pdf
+....
+
+In order to change the font (which you probably will need to do), edit `prelude.ps` and look for the line that has `findfont` on it.
+
+If you want the line numbers included, change `/zDN false def` to `/zDN true def` in `prelude.ps`. Depending on the font you're using, you may or may not also need to change the font size (`/zCH`)
+
+== Disclaimer
+This was a weekend hack, so don't expect much readability.
+If it breaks, good luck with that. If you fix it, send a patch.
+
+It also doesn't do quite as much as you might want: inline section references are ignored, and tables aren't linked (though their contents entries are parsed).
--- a/docproc.py
+++ b/docproc.py
@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+import sys
+from dataclasses import dataclass
+import re
+import io
+from typing import TextIO, Union
+
+import click
+
+
+@dataclass
+class Line:
+    """One input line, split into margin and body parts by Processor.real_do_line."""
+    line_no: str  # leading line-number columns of the input line
+    is_change: bool  # True when the change-marker column holds a non-space character
+    content: str  # text after the margin columns; empty for lines shorter than the margin
+
+
+@dataclass
+class Outline:
+    """One outline entry, written out by Processor.emit_outline via the `mO` procedure."""
+    dest_name: str  # name of the destination the entry points at (e.g. "sec.1.2")
+    label: str  # title text of the entry
+    subitems: list["Outline"]  # nested entries, emitted right after their parent
+
+roman_lookup = []
+roman_lookup_rev = {}
+roman_lookup_re = re.compile("")
+
+def build_roman():
+    roman_lookup_raw = [
+        "/i/ii/iii/iv/v/vi/vii/ix",
+        "/x/xx/xxx/xl/l/lx/lxx/xc",
+        "/c/cc/ccc/cd/d/dc/dcc/cm",
+        "/m/mm/mmm"
+    ]
+    re_parts = []
+    for i,pats in enumerate(roman_lookup_raw):
+        pats = pats.split('/')
+        base = 10**i
+        roman_lookup.append((base, pats))
+        re_parts.append("(" + "|".join(pats) + ")")
+        for j, pat in enumerate(pats):
+            roman_lookup_rev[pat] = base * j
+    roman_lookup_re = re.compile("^" + "".join(roman_lookup) + "$", re.IGNORECASE)
+
+def romanize(n):
+    res = []
+    for base, pats in roman_lookup:
+        res.append(pats[(n // base) % 10])
+    return "".join(reversed(res))
+
+def unromanize(s):
+    m = roman_lookup_re.match(s)
+    if m:
+        res = 0
+        for item in m.groups()[1:]:
+            res += int(roman_lookup_rev[item])
+        return res
+    raise ValueError(f"{s!r} is not a valid roman numeral")
+
+ps_str_escapes = re.compile(r'([()\\])')
+def ps_string(s):
+    s = ps_str_escapes.sub(r'\\\1', s)
+    return f"({s})"
+
+
+
+class Processor:
+    line1pat = re.compile(r'^\s*Digital Equipment Corporation\s\s+Confidential And Proprietary\s*$')
+    ch_pat = re.compile(r'^\s*((?:\S|\s\S)+)\s\s+Page (\S+)')
+    toc_chapter_re = re.compile(r'^\s*(CHAPTER \d+|APPENDIX [A-Z]+|FIGURES|TABLES)(?:\s+((?:\S| \S)+))?\s*$')
+    toc_section_re = re.compile(r'^\s*((?:\d+|[A-Z])(?:(?:\.\d+)+|-\d+))\s+((?:\S|\s[^. ])+)\s+\s*(?:\. )*\s*([A-Z]-\d+|\d+-\d+)$')
+    bdy_chapter_re = re.compile(r'^\s*(?:CHAPTER|APENDIX)\s*(\d+|[A-Z]+)\s*$')
+    bdy_section_re = re.compile(r'^\s*((?:[A-Z]|\d+)(?:\.\d+)+)\s\s((?:\S|\s\S)+)\s*$')
+
+    # Config settings
+    out: TextIO = sys.stdout
+    guess_page = True
+    ln_cols = 6
+    change_col = 8
+    include_num = False
+    number_color = 0.5
+    header_color = 0.5
+    marginalia = 11
+    header_lines = 4
+    trailer_lines = 1
+
+    fontsize = 10
+    linespc = fontsize * 1.2
+
+    lines: list[Line] = []
+
+    # running state
+    last_chapter = None
+    is_toc = False
+    last_page = None
+    page_n: int = 1
+    input_line: int = 0
+    pg_start: int = 0
+    toc_pfx: str = "sec."
+
+    chapters: dict[str, str]
+    outline: list[Outline]
+
+    def __init__(self, file: Union[TextIO, str]):
+        if isinstance(file, str):
+            self.out = open(file, "wt")
+        else:
+            self.out = file
+
+        # Emit front-matter
+        with open("prelude.ps", "rt") as prelude:
+            for line in prelude:
+                self.emit(line.strip("\n"))
+
+        self.outline = []
+        self.chapters = {}
+
+    def do_line(self, line):
+        if '\f' in line:
+            self.guess_page = False
+            segments = line.split('\f')
+            for segment in segments[:-1]:
+                if segment != "":
+                    self.real_do_line(segment)
+                self.flush_page()
+            self.real_do_line(segments[-1])
+        else:
+            self.real_do_line(line)
+        self.input_line += 1
+
+    def real_do_line(self, line):
+        # Expand tabs
+        if '\t' in line:
+            pad = " " * 8
+            segs = []
+            for seg in line.split('\t'):
+                segs.append(seg)
+                segs.append(pad[:8-(len(seg)%8)])
+            line = "".join(segs)
+        if self.guess_page and self.line1pat.match(line):
+            self.flush_page()
+        is_change = len(line) > self.change_col and line[self.change_col] != ' '
+        if len(line) >= self.marginalia:
+            line_no = line[:self.ln_cols]
+            content = line[self.marginalia:]
+        else:
+            line_no = (line + " " * self.ln_cols)[:self.ln_cols]
+            content = ""
+        self.lines.append(Line(line_no = line_no, content=content, is_change=is_change))
+
+    def flush_page(self):
+        height = len(self.lines)
+        width = max(len(line.content) for line in self.lines)
+
+        last_chapter = self.last_chapter
+        chapter_m = self.ch_pat.match(self.lines[2].content)
+        if chapter_m is not None:
+            self.last_chapter = chapter = chapter_m.group(1)
+            page = chapter_m.group(2)
+        else:
+            print(f"Warning: no page number found on pg {self.pg_start}", file=sys.stderr)
+            chapter = last_chapter
+            page = self.guess_next_page()
+
+        self.last_page = page
+
+        if chapter != last_chapter:
+            if chapter == "CONTENTS":
+                self.is_toc = True
+            else:
+                self.is_toc = False
+
+        lines = list(enumerate(self.lines))
+        links = []
+        dests = []
+
+        if self.is_toc:
+            # Process TOC entries
+            partial_toc = ""
+            partial_toc_start = None
+            for i, line in lines[self.header_lines:-self.trailer_lines]:
+                if line.content.strip() == "":
+                    partial_toc_start = None
+                    partial_toc = ""
+                    continue
+                elif line.content.strip() == "CONTENTS":
+                    dests.append((i, "sec.CONTENTS"))
+                    self.outline.append(Outline("sec.CONTENTS", "CONTENTS", []))
+                    continue
+                elif m := self.toc_chapter_re.match(line.content):
+                    label = m.group(1)
+                    title = m.group(2)
+                    if ' ' in label:
+                        num = label.split(' ')[1]
+                    else:
+                        num = label
+                        if label == "FIGURES":
+                            self.toc_pfx = "sec.FIGURES."
+                            dests.append((i, "sec.FIGURES"))
+                        elif label == "TABLES":
+                            self.toc_pfx = "sec.TABLES."
+                            dests.append((i, "sec.TABLES"))
+                    partial_toc_start = None
+                    partial_toc = ""
+                    left = len(line.content) - len(line.content.lstrip())
+                    right = len(line.content.rstrip())
+                    links.append((left, i, right, i, f"sec.{num}"))
+                    self.outline.append(Outline(f"sec.{num}", f"{label} - {title}" if title else label, []))
+                    continue
+
+                # TODO: This won't work if an entry is split across pages.
+                # To fix, partial_toc and partial_toc_start must be saved after/restored before
+                # this function
+                if partial_toc_start is not None:
+                    partial_toc += " " + line.content.strip()
+                else:
+                    partial_toc = line.content.rstrip()
+                    partial_toc_start = i
+
+                if m := self.toc_section_re.match(partial_toc):
+                    num = m.group(1)
+                    title = m.group(2)
+                    left = partial_toc.index(num[0])
+                    right = len(line.content.rstrip())
+                    links.append((left, partial_toc_start, right, i, f"{self.toc_pfx}{num}"))
+                    self.add_outline(self.toc_pfx + num, title)
+                    partial_toc = ""
+                    partial_toc_start = None
+                elif partial_toc:
+                    print(f"No match: {partial_toc!r}", file=sys.stderr)
+        else:
+            # Process body lines
+            partial_toc = ""
+            partial_toc_start = None
+            for i, line in lines[self.header_lines:-self.trailer_lines]:
+                if line.content.strip() == "":
+                    partial_toc_start = None
+                    partial_toc = ""
+                    continue
+                elif m := self.bdy_chapter_re.match(line.content):
+                    num = "sec." + m.group(1)
+                    dests.append((i, num))
+
+                if partial_toc_start is not None:
+                    partial_toc += " " + line.content.strip()
+                else:
+                    partial_toc = line.content.rstrip()
+                    partial_toc_start = i
+
+                if m := self.bdy_section_re.match(partial_toc):
+                    num = m.group(1)
+                    dests.append((partial_toc_start, "sec." + num))
+
+
+
+
+        # Begin emitting page
+        self.emit(f"%%Page: {ps_string(page)} {self.page_n}")
+        self.emit(f"{height} {width} {ps_string(page)} bP")
+
+
+        for i, line in lines[:self.header_lines]:
+            self.emit(f"{i} gL {ps_string(line.content)} tH")
+
+        for i, line in lines[self.header_lines:-self.trailer_lines]:
+            chg = " mC" if  line.is_change else ""
+            self.emit(f"{i} gL {ps_string(line.line_no)} tN {ps_string(line.content)} tB{chg}")
+
+        for i, line in lines[-self.trailer_lines:]:
+            self.emit(f"{i} gL {ps_string(line.content)} tH")
+
+        for link in links:
+            self.emit(f"{link[0]} {link[1]} {link[2]} {link[3]} {ps_string(link[4])} mL")
+        for dest in dests:
+            self.emit(f"{dest[0]} {ps_string(dest[1])} mD")
+
+        self.emit("sP")
+
+        self.lines = []
+        self.pg_start = self.input_line
+
+    def guess_next_page(self):
+        if self.last_page is not None:
+            if m := re.match(r"^(\d*-)(\d*)$", self.last_page):
+                return f"{m.group(1)}{1+int(m.group(2))}"
+            else:
+                return romanize(1 + unromanize(self.last_page))
+        else:
+            return "i"
+
+    def emit(self, s):
+        print(s, file=self.out)
+
+    def emit_trailer(self):
+        self.emit("%%Trailer")
+        self.emit_outline(self.outline)
+        self.emit("%%EOF")
+
+    def emit_outline(self, outline: list[Outline]):
+        for item in outline:
+            self.emit(f"{ps_string(item.dest_name)} {ps_string(item.label)} {len(item.subitems)} mO")
+            self.emit_outline(item.subitems)
+
+    def add_outline(self, dest, title):
+        entry = Outline(dest, title, [])
+        typ = dest.split('.', 1)[0]
+
+        def recur(pfx, elist: list[Outline]):
+            for item in elist:
+                if dest.startswith(item.dest_name + "."):
+                    recur(item.dest_name, item.subitems)
+                    break
+            else:
+                if typ == "sec":
+                    assert dest.rsplit(".", 1)[0] == pfx
+                elist.append(entry)
+        recur(typ, self.outline)
+
+
+@click.command()
+@click.option("-o", "--output", type=click.File("w"))
+@click.argument("input", type=click.File("r"), default="-")
+def main(output, input):
+    proc = Processor(output or sys.stdout)
+    for line in input:
+        proc.do_line(line.strip("\n"))
+    proc.flush_page()
+    proc.emit_trailer()
+
+
+if __name__ == '__main__':
+    main()
--- a/prelude.ps
+++ b/prelude.ps
@@ -0,0 +1,126 @@
+%!PS-Adobe-3.0
+%%Creator: Docproc.py
+%%Orientation: Portrait
+%%DocumentMedia: Letter 612 792 90 white ( )
+%%BeginDefaults
+%%PageMedia: Letter
+%%EndDefaults
+
+%%BeginProlog
+% zDN: whether tN draws the line-number column (true) or not (false)
+/zDN false def
+% zNW: width, in characters, set aside for line numbers (0 when they are off)
+/zNW zDN { 6 } { 0 } ifelse def
+/bP { % begin Page
+	% height width pageName --
+	% height and width are in characters
+	% Sets the page label and defines zX0 (left edge of the text), zXC (x of
+	% the change bar) and zY0 (baseline of line 0) so that a text block of
+	% width x height characters, plus the line-number column, is centred.
+	
+	% consume pageName as the page label
+	mark exch /Label exch /PAGELABEL pdfmark
+	dup
+	% zX0 = (zPW - (width + zNW) * zCW) / 2 + zNW * zCW
+	zPW exch zNW add zCW mul sub 2 div zNW zCW mul add /zX0 exch def
+	% zXC = zX0 + (width + 1) * zCW
+	1 add zCW mul zX0 add /zXC exch def
+	zPH exch 
+	% -- zPH nlines 
+	  1 sub zLS mul zCH add 
+	% -- zPH zTH
+	% zY0 = (zPH + zTH) / 2 - zCH
+	add 2 div zCH sub /zY0 exch def
+} bind def
+/gL {
+	% lineIndex --
+	% Sets zY1, the baseline used by the text procedures: zY0 - lineIndex * zLS
+	zLS mul neg zY0 add /zY1 exch def
+} bind def % go line
+/tH {
+	% string --
+	% Shows the string at (zX0, zY1) in 50% gray
+	zX0 zY1 moveto
+	0.5 setgray
+	show
+} bind def % text Header
+% tN: string --
+% Shows the line-number string to the left of the text block when zDN is
+% true.  When zDN is false the string must still be consumed: the generator
+% pushes one before every tN, and leaving it behind would grow the operand
+% stack by one string per body line.
+/tN zDN {{
+	zX0 zNW 1 add zCW mul sub zY1 moveto
+	0.7 setgray
+	show
+}} {{ pop }} ifelse bind def % text lineNo
+/tB {
+	% string --
+	% Shows the string at (zX0, zY1) in black
+	zX0 zY1 moveto
+	0 setgray
+	show
+} bind def % text body
+/mC {
+	% --
+	% Draws a dark red vertical bar, one line spacing tall, at x = zXC next
+	% to the current line (starting 0.2 line spacings below zY1)
+
+	0.75 0 0 setrgbcolor
+	newpath
+	1 setlinewidth
+	zXC zY1 -0.2 zLS mul add moveto
+	0 zLS rlineto
+	stroke
+} bind def % mark Change
+/fC {
+	% column line -- x y
+	% y = zY0 - (line + 0.2) * zLS
+	0.2 add zLS mul neg zY0 add exch
+	% x = zX0 + column * zCW
+	zCW mul zX0 add exch
+} bind def % from Charpos
+/mL { % -- left top right bottom name
+	% Adds a link annotation to the destination called name.  left/right are
+	% columns and top/bottom are line indexes; fC converts them to points.
+	5 dict begin
+		/_N exch def
+		/_B exch def
+		/_R exch def
+		/_T exch def
+		/_L exch def
+		mark
+		% Rect runs from the bottom-left cell corner to the corner one line
+		% above the top line
+		/Rect [
+			_L _B fC
+			_R _T 1 sub fC
+		]
+		/Border [ 0 0 0.5 ]
+		/C [ 0 0 1 ]
+		/Subtype /Link
+		/Dest _N cvn
+		/ANN
+		pdfmark
+
+% Debugging aid: outlines the link rectangle in green when enabled
+%		0 1 0 setrgbcolor
+%		1 setlinewidth
+%		newpath
+%
+%		_L _T 1 sub fC moveto
+%		_R _T 1 sub fC lineto
+%		_R _B fC lineto
+%		_L _B fC lineto
+%		closepath
+%		stroke
+	end
+} bind def % mark Link
+/mD { % -- top name
+	% Defines the named destination name on the current page
+	2 dict begin
+		/_N exch def
+		% _T = y coordinate of the line above top (fC's x result is dropped)
+		/_T exch 0 exch 1 sub fC exch pop def
+		mark 
+		/Dest _N cvn
+		% only the vertical position is given; left and zoom are null
+		/View [ /XYZ null _T null ]
+		/DEST pdfmark
+	end
+} bind def % mark Destination
+/mO { % -- dest title subitems
+	% Adds an outline entry called title that points at the destination dest.
+	% subitems is the number of entries following this one that are its
+	% children; /Count is only given when there are any.
+	3 dict begin
+		/_S exch def
+		/_T exch def
+		/_D exch def
+		mark
+		/Title _T 
+		_S 0 gt { 
+			/Count _S
+		} if
+		/Dest _D cvn
+		/OUT pdfmark
+	end
+} bind def
+% sP: -- ; ends the current page
+/sP { showpage } bind def
+
+%%EndProlog
+
+%%BeginSetup
+/zCH 10 def % char height
+/zLS zCH 1.2 mul def % line spacing
+% The font used for all text; change the name here to use another one
+(DroidSansMonoSlashed) findfont zCH scalefont setfont
+
+% Globals
+% zCW is the width of "M" and is used as the width of every character cell
+% NOTE(review): this presumably relies on the font being monospaced - confirm
+/zCW (M) stringwidth pop def % char width
+% zPH and zPW are procedures, so the page size is looked up on every use
+/zPH { currentpagedevice /PageSize get 1 get } def % page height
+/zPW { currentpagedevice /PageSize get 0 get } def % page width
+%%EndSetup
+
--- a/sources/mscp.txt
+++ b/sources/mscp.txt
--- a/sources/tmscp.txt
+++ b/sources/tmscp.txt