Initial commit

This commit is contained in:
2025-01-26 03:02:54 +01:00
commit fa7499b001
6 changed files with 39305 additions and 0 deletions

BIN
DroidSansMonoSlashed.ttf Normal file

Binary file not shown.

26
README.adoc Normal file
View File

@@ -0,0 +1,26 @@
= DEC documentation processor
== Intro
This converts DEC-derived .txt files (produced by whatever tool DEC used to generate its documents) to PostScript, ready for conversion to PDF, including
a table of contents (which is handy with 400-odd page documents).
The code is terrible, and a non-trivial amount of the heavy lifting (including
all the page layout) is done in PostScript, but it works for the Mass
Storage Control Protocol documents from bitsavers, so I'm happy.
== Usage
....
python3 docproc.py -o mscp.ps mscp.txt
ps2pdf mscp.ps mscp.pdf
....
In order to change the font (which you probably will need to do), edit `prelude.ps` and look for the line that has `findfont` on it.
If you want the line numbers included, change `/zDN false def` to `/zDN true def` in `prelude.ps`. Depending on the font you're using, you may or may not also need to change the font size (`/zCH`)
== Disclaimer
This was a weekend hack, so don't expect much readability.
If it breaks, good luck with that. If you fix it, send a patch.
It also doesn't do quite as much as you might want: inline section references are ignored, and tables aren't linked (though that could be added).

332
docproc.py Normal file
View File

@@ -0,0 +1,332 @@
#!/usr/bin/env python3
import sys
from dataclasses import dataclass
import re
import io
from typing import TextIO, Union
import click
@dataclass
class Line:
    """One input line of the current page, split into its columns."""

    # Text taken from the line-number columns at the start of the input line.
    line_no: str
    # True when the change-mark column of the input line is not a space.
    is_change: bool
    # Text remaining after the left-hand margin columns are removed.
    content: str
@dataclass
class Outline:
    """One entry of the document outline, with its nested entries."""

    # Name of the destination this entry jumps to.
    dest_name: str
    # Text displayed for this entry.
    label: str
    # Entries nested directly below this one.
    subitems: list["Outline"]
# Roman-numeral lookup tables, filled in by build_roman().
#   roman_lookup     - one (base, patterns) pair per decimal place, units
#                      first; patterns[d] is the numeral for digit d at that
#                      place ("" for a zero digit).
#   roman_lookup_rev - numeral fragment -> the value it contributes.
#   roman_lookup_re  - matches one complete numeral; one group per decimal
#                      place, most significant place first.
roman_lookup = []
roman_lookup_rev = {}
roman_lookup_re = re.compile("")


def build_roman():
    """(Re)build the roman-numeral tables and the parsing regex.

    Safe to call more than once: the tables are cleared before being
    refilled, and the module-level regex is replaced.
    """
    global roman_lookup_re
    # One string per decimal place; the leading "/" yields the empty numeral
    # for digit 0, so every full place has exactly ten entries (0..9).
    roman_lookup_raw = [
        "/i/ii/iii/iv/v/vi/vii/viii/ix",
        "/x/xx/xxx/xl/l/lx/lxx/lxxx/xc",
        "/c/cc/ccc/cd/d/dc/dcc/dccc/cm",
        "/m/mm/mmm",
    ]
    roman_lookup.clear()
    roman_lookup_rev.clear()
    re_parts = []
    for i, pats in enumerate(roman_lookup_raw):
        pats = pats.split('/')
        base = 10 ** i
        roman_lookup.append((base, pats))
        re_parts.append("(" + "|".join(pats) + ")")
        for j, pat in enumerate(pats):
            roman_lookup_rev[pat] = base * j
    # Numerals are written most significant place first, so the per-place
    # groups go into the regex in the reverse of table order. re.ASCII keeps
    # IGNORECASE from accepting non-ASCII look-alikes of the letters.
    roman_lookup_re = re.compile(
        "^" + "".join(reversed(re_parts)) + "$",
        re.IGNORECASE | re.ASCII,
    )


def romanize(n):
    """Return *n* as a lower-case roman numeral ("" for 0).

    Raises:
        ValueError: if *n* is negative or larger than the tables can express
            (3999).
    """
    limit = sum(base * (len(pats) - 1) for base, pats in roman_lookup)
    if not 0 <= n <= limit:
        raise ValueError(f"{n!r} cannot be written as a roman numeral")
    res = [pats[(n // base) % 10] for base, pats in roman_lookup]
    return "".join(reversed(res))


def unromanize(s):
    """Return the integer value of the roman numeral *s* (case-insensitive).

    Raises:
        ValueError: if *s* is empty or not a well-formed numeral.
    """
    m = roman_lookup_re.fullmatch(s) if s else None
    if m:
        # Every group is the fragment for one decimal place; sum their values.
        return sum(roman_lookup_rev[item.lower()] for item in m.groups())
    raise ValueError(f"{s!r} is not a valid roman numeral")


# Populate the tables at import time so romanize()/unromanize() are usable.
build_roman()
# The three characters that get a backslash in front of them inside a
# PostScript string literal: both parentheses and the backslash itself.
ps_str_escapes = re.compile(r'([()\\])')


def ps_string(s):
    """Return *s* wrapped in parentheses as a PostScript string literal.

    Parentheses and backslashes inside *s* are prefixed with a backslash.
    """
    escaped = ps_str_escapes.sub(lambda hit: "\\" + hit.group(1), s)
    return "(" + escaped + ")"
class Processor:
    """Turn the lines of a paginated .txt document into PostScript pages.

    Lines are fed in one at a time through do_line(). They are collected
    until a page boundary is seen (a form feed, or the first-line banner
    while no form feed has been seen yet), at which point flush_page()
    writes the page as calls to the procedures defined in the prelude.
    emit_trailer() finishes the output with the collected outline.
    """

    # First line of every page; used to detect page breaks until a form
    # feed is seen in the input.
    line1pat = re.compile(r'^\s*Digital Equipment Corporation\s\s+Confidential And Proprietary\s*$')
    # Third line of a page: running title, then "Page <label>".
    ch_pat = re.compile(r'^\s*((?:\S|\s\S)+)\s\s+Page (\S+)')
    # Table-of-contents lines: chapter-level headings and numbered entries.
    toc_chapter_re = re.compile(r'^\s*(CHAPTER \d+|APPENDIX [A-Z]+|FIGURES|TABLES)(?:\s+((?:\S| \S)+))?\s*$')
    toc_section_re = re.compile(r'^\s*((?:\d+|[A-Z])(?:(?:\.\d+)+|-\d+))\s+((?:\S|\s[^. ])+)\s+\s*(?:\. )*\s*([A-Z]-\d+|\d+-\d+)$')
    # Body lines: chapter/appendix headings and numbered section headings.
    # "APP?ENDIX" accepts the correct spelling as well as the single-P form
    # this pattern originally (mis)spelled.
    bdy_chapter_re = re.compile(r'^\s*(?:CHAPTER|APP?ENDIX)\s*(\d+|[A-Z]+)\s*$')
    bdy_section_re = re.compile(r'^\s*((?:[A-Z]|\d+)(?:\.\d+)+)\s\s((?:\S|\s\S)+)\s*$')

    # Config settings
    out: TextIO = sys.stdout
    guess_page = True      # detect page breaks from line1pat (cleared on first form feed)
    ln_cols = 6            # leading columns that hold the line number
    change_col = 8         # column index whose non-space content marks a changed line
    include_num = False    # not referenced by this class
    number_color = 0.5     # not referenced by this class
    header_color = 0.5     # not referenced by this class
    marginalia = 11        # number of leading columns removed from the content
    header_lines = 4       # lines at the top of a page emitted as header text
    trailer_lines = 1      # lines at the bottom of a page emitted as header text
    fontsize = 10          # not referenced by this class
    linespc = fontsize * 1.2  # not referenced by this class
    # Lines of the page being collected. Created per instance in __init__;
    # a class-level list would be shared by every Processor.
    lines: "list[Line]"

    # running state
    last_chapter = None    # running title of the previous page
    is_toc = False         # True while inside the CONTENTS pages
    last_page = None       # page label of the previous page
    page_n: int = 1        # ordinal of the next page to be emitted
    input_line: int = 0    # number of input lines consumed so far
    pg_start: int = 0      # input_line value at the start of the current page
    toc_pfx: str = "sec."  # destination prefix for numbered TOC entries
    chapters: dict[str, str]
    outline: "list[Outline]"
    _owns_out = False      # True when self.out was opened by __init__

    def __init__(self, file: Union[TextIO, str], prelude: str = "prelude.ps"):
        """Open the output and copy the prelude into it.

        Args:
            file: an open text stream, or a path to create and write to.
            prelude: path of the PostScript prelude copied to the output first.
        """
        if isinstance(file, str):
            self.out = open(file, "wt")
            self._owns_out = True
        else:
            self.out = file
        self.lines = []
        self.outline = []
        self.chapters = {}
        # Emit front-matter
        try:
            with open(prelude, "rt") as src:
                for line in src:
                    self.emit(line.strip("\n"))
        except BaseException:
            # Do not leak an output file we opened ourselves.
            self.close()
            raise

    def close(self):
        """Close the output stream if this object opened it."""
        if self._owns_out:
            self.out.close()
            self._owns_out = False

    def do_line(self, line):
        """Consume one input line (without its newline).

        A form feed ends the current page; once one has been seen, banner
        based page detection is switched off.
        """
        if '\f' in line:
            self.guess_page = False
            segments = line.split('\f')
            for segment in segments[:-1]:
                if segment != "":
                    self.real_do_line(segment)
                self.flush_page()
            self.real_do_line(segments[-1])
        else:
            self.real_do_line(line)
        self.input_line += 1

    def real_do_line(self, line):
        """Split one form-feed-free line into columns and add it to the page."""
        # Expand tabs to 8-column stops. Unlike padding every tab-separated
        # segment, this adds nothing after the last one, so the measured page
        # width is not inflated.
        line = line.expandtabs(8)
        if self.guess_page and self.line1pat.match(line):
            self.flush_page()
        is_change = len(line) > self.change_col and line[self.change_col] != ' '
        if len(line) >= self.marginalia:
            line_no = line[:self.ln_cols]
            content = line[self.marginalia:]
        else:
            # Too short to have content: keep only a padded line number.
            line_no = line.ljust(self.ln_cols)[:self.ln_cols]
            content = ""
        self.lines.append(Line(line_no=line_no, content=content, is_change=is_change))

    def flush_page(self):
        """Emit the collected lines as one page and start a new page.

        Does nothing when no lines have been collected (for example when the
        very first input line is the page banner).
        """
        if not self.lines:
            return
        height = len(self.lines)
        width = max(len(line.content) for line in self.lines)
        last_chapter = self.last_chapter
        # The running title and page label are on the third line, if present.
        title_line = self.lines[2].content if height > 2 else ""
        chapter_m = self.ch_pat.match(title_line)
        if chapter_m is not None:
            self.last_chapter = chapter = chapter_m.group(1)
            page = chapter_m.group(2)
        else:
            print(f"Warning: no page number found on pg {self.pg_start}", file=sys.stderr)
            chapter = last_chapter
            page = self.guess_next_page()
        self.last_page = page
        if chapter != last_chapter:
            self.is_toc = chapter == "CONTENTS"

        lines = list(enumerate(self.lines))
        # Explicit end index instead of a negative slice: correct when
        # trailer_lines is 0, and never hands a header line to the trailer
        # on a short page.
        body_end = max(self.header_lines, height - self.trailer_lines)
        head = lines[:self.header_lines]
        body = lines[self.header_lines:body_end]
        tail = lines[body_end:]

        if self.is_toc:
            links, dests = self._scan_toc(body)
        else:
            links, dests = [], self._scan_body(body)

        self._emit_page(page, height, width, head, body, tail, links, dests)
        self.page_n += 1
        self.lines = []
        self.pg_start = self.input_line

    def _scan_toc(self, body):
        """Collect links, destinations and outline entries from a TOC page."""
        links = []
        dests = []
        # TODO: This won't work if an entry is split across pages.
        # To fix, partial_toc and partial_toc_start must be kept between calls.
        partial_toc = ""
        partial_toc_start = None
        for i, line in body:
            text = line.content
            stripped = text.strip()
            if stripped == "":
                partial_toc_start = None
                partial_toc = ""
                continue
            if stripped == "CONTENTS":
                dests.append((i, "sec.CONTENTS"))
                self.outline.append(Outline("sec.CONTENTS", "CONTENTS", []))
                continue
            if m := self.toc_chapter_re.match(text):
                label = m.group(1)
                title = m.group(2)
                # "CHAPTER 3" -> "3", "APPENDIX A" -> "A", "FIGURES" -> "FIGURES"
                num = label.split(' ')[1] if ' ' in label else label
                if label == "FIGURES":
                    self.toc_pfx = "sec.FIGURES."
                    dests.append((i, "sec.FIGURES"))
                elif label == "TABLES":
                    self.toc_pfx = "sec.TABLES."
                    dests.append((i, "sec.TABLES"))
                partial_toc_start = None
                partial_toc = ""
                left = len(text) - len(text.lstrip())
                right = len(text.rstrip())
                links.append((left, i, right, i, f"sec.{num}"))
                self.outline.append(Outline(f"sec.{num}", f"{label} - {title}" if title else label, []))
                continue
            # A numbered entry may wrap: join lines until the pattern matches.
            if partial_toc_start is not None:
                partial_toc += " " + stripped
            else:
                partial_toc = text.rstrip()
                partial_toc_start = i
            if m := self.toc_section_re.match(partial_toc):
                num = m.group(1)
                title = m.group(2)
                left = m.start(1)
                right = len(text.rstrip())
                links.append((left, partial_toc_start, right, i, f"{self.toc_pfx}{num}"))
                self.add_outline(self.toc_pfx + num, title)
                partial_toc = ""
                partial_toc_start = None
            elif partial_toc:
                print(f"No match: {partial_toc!r}", file=sys.stderr)
        return links, dests

    def _scan_body(self, body):
        """Collect the destinations for headings found on a body page."""
        dests = []
        partial = ""
        partial_start = None
        for i, line in body:
            text = line.content
            if text.strip() == "":
                partial_start = None
                partial = ""
                continue
            if m := self.bdy_chapter_re.match(text):
                dests.append((i, "sec." + m.group(1)))
            # Headings may wrap: test the run of non-blank lines joined so far.
            if partial_start is not None:
                partial += " " + text.strip()
            else:
                partial = text.rstrip()
                partial_start = i
            if m := self.bdy_section_re.match(partial):
                dest = (partial_start, "sec." + m.group(1))
                # The joined text can keep matching as lines are added; emit
                # each destination only once.
                if dest not in dests:
                    dests.append(dest)
        return dests

    def _emit_page(self, page, height, width, head, body, tail, links, dests):
        """Write one page: header, body and trailer lines, links, destinations."""
        self.emit(f"%%Page: {ps_string(page)} {self.page_n}")
        self.emit(f"{height} {width} {ps_string(page)} bP")
        for i, line in head:
            self.emit(f"{i} gL {ps_string(line.content)} tH")
        for i, line in body:
            chg = " mC" if line.is_change else ""
            self.emit(f"{i} gL {ps_string(line.line_no)} tN {ps_string(line.content)} tB{chg}")
        for i, line in tail:
            self.emit(f"{i} gL {ps_string(line.content)} tH")
        for left, top, right, bottom, name in links:
            self.emit(f"{left} {top} {right} {bottom} {ps_string(name)} mL")
        for top, name in dests:
            self.emit(f"{top} {ps_string(name)} mD")
        self.emit("sP")

    def guess_next_page(self):
        """Return the label expected to follow self.last_page.

        A label ending in digits ("3-7", "A-9", "12") has that number
        incremented; anything else is treated as a roman numeral. With no
        previous page the answer is "i".

        Raises:
            ValueError: if the previous label is neither form.
        """
        if self.last_page is None:
            return "i"
        if m := re.match(r"^(.*?)(\d+)$", self.last_page):
            return f"{m.group(1)}{1 + int(m.group(2))}"
        return romanize(1 + unromanize(self.last_page))

    def emit(self, s):
        """Write one line of PostScript to the output."""
        print(s, file=self.out)

    def emit_trailer(self):
        """Write the document trailer, including the outline."""
        self.emit("%%Trailer")
        self.emit_outline(self.outline)
        self.emit("%%EOF")

    def emit_outline(self, outline: "list[Outline]"):
        """Write outline entries depth-first, each followed by its children."""
        for item in outline:
            self.emit(f"{ps_string(item.dest_name)} {ps_string(item.label)} {len(item.subitems)} mO")
            self.emit_outline(item.subitems)

    def add_outline(self, dest, title):
        """Insert an outline entry below the deepest entry that prefixes *dest*.

        An entry whose direct parent is missing is reported on stderr and
        attached to the closest ancestor found, instead of aborting the run.
        """
        entry = Outline(dest, title, [])
        typ = dest.split('.', 1)[0]
        parent = typ
        siblings = self.outline
        # Walk down while some entry's destination is a dotted prefix of dest.
        while True:
            for item in siblings:
                if dest.startswith(item.dest_name + "."):
                    parent, siblings = item.dest_name, item.subitems
                    break
            else:
                break
        if typ == "sec" and dest.rsplit(".", 1)[0] != parent:
            print(
                f"Warning: outline entry {dest!r} has no direct parent; attached under {parent!r}",
                file=sys.stderr,
            )
        siblings.append(entry)
# Command-line entry point: read the input document line by line, feed it to
# a Processor writing to OUTPUT (stdout when -o is not given), then flush the
# last page and write the trailer.
# (Comments rather than a docstring: click would print a docstring as help.)
@click.command()
@click.option("-o", "--output", type=click.File("w"))
@click.argument("input", type=click.File("r"), default="-")
def main(output, input):
    sink = output if output else sys.stdout
    proc = Processor(sink)
    for raw in input:
        proc.do_line(raw.strip("\n"))
    # The last page has no following page break to trigger its output.
    proc.flush_page()
    proc.emit_trailer()


if __name__ == '__main__':
    main()

126
prelude.ps Normal file
View File

@@ -0,0 +1,126 @@
%!PS-Adobe-3.0
%%Creator: Docproc.py
%%Orientation: Portrait
%%DocumentMedia: Letter 612 792 90 white ( )
%%BeginDefaults
%%PageMedia: Letter
%%EndDefaults
%%BeginProlog
% zDN: true draws the line-number column (see tN); false leaves it out.
/zDN false def
% zNW: number of character cells reserved for line numbers when laying out
% the page: 6 when zDN is true, otherwise 0.
/zNW zDN { 6 } { 0 } ifelse def
/bP { % begin Page
% height width pageName --
% height and width are in characters
% Records pageName as the page label, then derives the layout globals:
%   zX0 - x of the first text column (text plus number column centred on
%         the page width, then moved right past the number column)
%   zXC - x of the change bar: width+1 character cells right of zX0
%   zY0 - y of line 0 (block of `height` lines centred on the page height)
mark exch /Label exch /PAGELABEL pdfmark
dup
zPW exch zNW add zCW mul sub 2 div zNW zCW mul add /zX0 exch def
1 add zCW mul zX0 add /zXC exch def
zPH exch
% -- zPH nlines
% text height zTH = (nlines - 1) * zLS + zCH
1 sub zLS mul zCH add
% -- zPH zTH
add 2 div zCH sub /zY0 exch def
} bind def
% gL: lineIndex --
% Sets zY1, the y position of the given line: zY0 - lineIndex * zLS.
/gL {
zLS mul neg zY0 add /zY1 exch def
} bind def % go line
% tH: string --
% Shows the string at (zX0, zY1) in 50% gray.
/tH {
zX0 zY1 moveto
0.5 setgray
show
} bind def % text Header
% tN: string --
% Line-number text. The body is chosen once, when this definition is read:
% each branch of the ifelse pushes an inner procedure, which is then bound
% and defined as tN.
%   zDN true  - show the string in 70% gray, starting zNW+1 character cells
%               left of zX0
%   zDN false - discard the string. It must be popped: the generated pages
%               always push a line-number string before calling tN, so an
%               empty procedure would leave one string on the operand stack
%               for every body line of the document.
/tN zDN {{
zX0 zNW 1 add zCW mul sub zY1 moveto
0.7 setgray
show
}} {{ pop }} ifelse bind def % text lineNo
% tB: string --
% Shows the string at (zX0, zY1) in black.
/tB {
zX0 zY1 moveto
0 setgray
show
} bind def % text body
% mC: --
% Marks the current line as changed: strokes a dark red vertical bar, one
% unit wide and zLS tall, at x = zXC starting 0.2 * zLS below zY1.
/mC {
0.75 0 0 setrgbcolor
newpath
1 setlinewidth
zXC zY1 -0.2 zLS mul add moveto
0 zLS rlineto
stroke
} bind def % mark Change
% fC: column line -- x y
% Converts a character position to page coordinates:
%   y = zY0 - (line + 0.2) * zLS,  x = zX0 + column * zCW
/fC {
0.2 add zLS mul neg zY0 add exch
zCW mul zX0 add exch
} bind def % from Charpos
% Emits a link annotation to the named destination. The rectangle spans
% character columns left..right and lines top-1..bottom (converted with fC).
/mL { % -- left top right bottom name
5 dict begin
% Pop the operands into local names, last operand first.
/_N exch def
/_B exch def
/_R exch def
/_T exch def
/_L exch def
mark
/Rect [
_L _B fC
_R _T 1 sub fC
]
/Border [ 0 0 0.5 ]
/C [ 0 0 1 ]
/Subtype /Link
% The destination name arrives as a string; cvn converts it to a name.
/Dest _N cvn
/ANN
pdfmark
% Disabled: strokes the outline of the link rectangle in green.
% 0 1 0 setrgbcolor
% 1 setlinewidth
% newpath
%
% _L _T 1 sub fC moveto
% _R _T 1 sub fC lineto
% _R _B fC lineto
% _L _B fC lineto
% closepath
% stroke
end
} bind def % mark Link
% Defines a named destination on the current page whose view is positioned
% at the y coordinate of line top-1 (left and zoom are left as null).
/mD { % -- top name
2 dict begin
/_N exch def
% _T = y from fC for column 0, line top-1; the x result is discarded.
/_T exch 0 exch 1 sub fC exch pop def
mark
/Dest _N cvn
/View [ /XYZ null _T null ]
/DEST pdfmark
end
} bind def % mark Destination
% Emits one outline entry with the given title pointing at the named
% destination. subitems is the number of entries nested directly below it;
% /Count is only included when that number is greater than zero.
/mO { % -- dest title subitems
3 dict begin
/_S exch def
/_T exch def
/_D exch def
mark
/Title _T
_S 0 gt {
/Count _S
} if
/Dest _D cvn
/OUT pdfmark
end
} bind def
% sP: --   show Page: finishes the current page with showpage.
/sP { showpage } bind def
%%EndProlog
%%BeginSetup
% Font metrics used by the layout procedures in the prolog.
/zCH 10 def % char height
/zLS zCH 1.2 mul def % line spacing
% Select the text font at size zCH; change the font name here to use another.
(DroidSansMonoSlashed) findfont zCH scalefont setfont
% Globals
% zCW is measured from the selected font, so it must come after setfont.
/zCW (M) stringwidth pop def % char width
% zPH / zPW are procedures: they query the current page device on each use.
/zPH { currentpagedevice /PageSize get 1 get } def % page height
/zPW { currentpagedevice /PageSize get 0 get } def % page width
%%EndSetup

29145
sources/mscp.txt Normal file

File diff suppressed because it is too large Load Diff

9676
sources/tmscp.txt Normal file

File diff suppressed because it is too large Load Diff