332 lines
11 KiB
Python
332 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
import sys
|
|
from dataclasses import dataclass
|
|
import re
|
|
import io
|
|
from typing import TextIO, Union
|
|
|
|
import click
|
|
|
|
|
|
@dataclass
class Line:
    """One buffered input line, split into margin fields and body text."""

    # Text taken from the leading line-number columns of the input line.
    line_no: str
    # True when the change-marker column of the input line is not a space.
    is_change: bool
    # Everything to the right of the margin columns ("" for short lines).
    content: str
|
|
|
|
|
|
@dataclass
class Outline:
    """A node of the document outline tree written out in the trailer."""

    # Name of the destination this outline entry points at (e.g. "sec.1.2").
    dest_name: str
    # Human-readable text shown for the entry.
    label: str
    # Child entries nested under this one.
    subitems: list["Outline"]
|
|
|
|
# Roman-numeral tables, populated by build_roman().
#   roman_lookup      list of (base, patterns); patterns[d] is the lower-case
#                     numeral fragment for digit d at that power of ten
#   roman_lookup_rev  maps a numeral fragment back to its integer value
#   roman_lookup_re   matches a whole numeral, one group per power of ten,
#                     most significant group first
roman_lookup = []
roman_lookup_rev = {}
roman_lookup_re = re.compile("")


def build_roman():
    """(Re)build the roman-numeral lookup tables and the matching regex.

    Safe to call more than once: the tables are cleared before being refilled.
    """
    global roman_lookup_re

    # One row per power of ten; entry d (after splitting on '/') is the
    # numeral for digit d, with the empty string standing for digit 0.
    roman_lookup_raw = [
        "/i/ii/iii/iv/v/vi/vii/viii/ix",
        "/x/xx/xxx/xl/l/lx/lxx/lxxx/xc",
        "/c/cc/ccc/cd/d/dc/dcc/dccc/cm",
        "/m/mm/mmm",
    ]

    roman_lookup.clear()
    roman_lookup_rev.clear()
    re_parts = []
    for i, raw in enumerate(roman_lookup_raw):
        pats = raw.split('/')
        base = 10 ** i
        roman_lookup.append((base, pats))
        re_parts.append("(" + "|".join(pats) + ")")
        for j, pat in enumerate(pats):
            roman_lookup_rev[pat] = base * j

    # The tables run from ones to thousands, but numerals are written
    # thousands first, so the groups go into the regex in reverse order.
    # re.ASCII keeps IGNORECASE from matching non-ASCII look-alikes of 'i'
    # that would then be missing from roman_lookup_rev.
    roman_lookup_re = re.compile(
        "^" + "".join(reversed(re_parts)) + "$",
        re.IGNORECASE | re.ASCII,
    )


# Fill the tables at import time so romanize()/unromanize() work immediately.
build_roman()


def romanize(n):
    """Return the lower-case roman numeral for integer *n* (0..3999).

    0 maps to the empty string.

    Raises:
        ValueError: if *n* is outside the range the tables can represent.
    """
    if not 0 <= n <= 3999:
        raise ValueError(f"{n!r} cannot be written as a roman numeral (0..3999)")
    # Collect ones, tens, hundreds, thousands, then flip to reading order.
    res = [pats[(n // base) % 10] for base, pats in roman_lookup]
    return "".join(reversed(res))


def unromanize(s):
    """Return the integer value of roman numeral *s* (case-insensitive).

    The empty string is accepted and yields 0.

    Raises:
        ValueError: if *s* is not a well-formed roman numeral.
    """
    m = roman_lookup_re.match(s)
    if m is None:
        raise ValueError(f"{s!r} is not a valid roman numeral")
    # Every group contributes (an empty group is worth 0); the reverse table
    # is keyed by lower-case fragments.
    return sum(roman_lookup_rev[part.lower()] for part in m.groups())
|
|
|
|
# Characters that need a backslash in front of them inside a parenthesised
# string: the two parentheses and the backslash itself.
ps_str_escapes = re.compile(r'([()\\])')


def ps_string(s):
    """Return *s* wrapped in parentheses, with parentheses and backslashes
    inside it each preceded by a backslash."""
    escaped = ps_str_escapes.sub(lambda hit: "\\" + hit.group(1), s)
    return "(" + escaped + ")"
|
|
|
|
|
|
|
|
class Processor:
    """Turn a paginated text listing into a stream of page-drawing commands.

    Input lines are fed in through do_line(); they are buffered until a page
    break is seen (a form feed, or -- while guess_page is still set -- a line
    matching line1pat) and then written out by flush_page() as a sequence of
    short operator calls (bP, gL, tH, tN, tB, mC, mL, mD, sP, mO).  Those
    operators are presumably defined by prelude.ps, which is copied to the
    output first -- confirm against that file.

    While flushing, table-of-contents pages are scanned for entries (which
    become links and outline items) and body pages for chapter and section
    headings (which become named destinations).
    """

    # First line of a page header; used to detect page starts when the input
    # carries no form feeds.
    line1pat = re.compile(r'^\s*Digital Equipment Corporation\s\s+Confidential And Proprietary\s*$')
    # Running head: "<chapter title>  Page <label>".
    ch_pat = re.compile(r'^\s*((?:\S|\s\S)+)\s\s+Page (\S+)')
    # TOC heading lines: CHAPTER n / APPENDIX X / FIGURES / TABLES, optional title.
    toc_chapter_re = re.compile(r'^\s*(CHAPTER \d+|APPENDIX [A-Z]+|FIGURES|TABLES)(?:\s+((?:\S| \S)+))?\s*$')
    # TOC entry: number, title, dot leader, page reference.
    toc_section_re = re.compile(r'^\s*((?:\d+|[A-Z])(?:(?:\.\d+)+|-\d+))\s+((?:\S|\s[^. ])+)\s+\s*(?:\. )*\s*([A-Z]-\d+|\d+-\d+)$')
    # Chapter/appendix heading in the body.  "APP?ENDIX" accepts the correct
    # spelling as well as the misspelt "APENDIX" this pattern used to require.
    bdy_chapter_re = re.compile(r'^\s*(?:CHAPTER|APP?ENDIX)\s*(\d+|[A-Z]+)\s*$')
    # Numbered section heading in the body: number, two blanks, title.
    bdy_section_re = re.compile(r'^\s*((?:[A-Z]|\d+)(?:\.\d+)+)\s\s((?:\S|\s\S)+)\s*$')

    # Config settings
    out: TextIO = sys.stdout
    guess_page = True       # detect page starts via line1pat until a form feed is seen
    ln_cols = 6             # width of the line-number field
    change_col = 8          # column holding the change marker
    include_num = False
    number_color = 0.5
    header_color = 0.5
    marginalia = 11         # number of leading columns that are not body text
    header_lines = 4        # lines at the top of a page emitted as header
    trailer_lines = 1       # lines at the bottom of a page emitted as trailer

    fontsize = 10
    linespc = fontsize * 1.2

    # running state
    last_chapter = None
    is_toc = False
    last_page = None
    page_n: int = 1         # ordinal of the next page to be emitted
    input_line: int = 0     # number of input lines consumed so far
    pg_start: int = 0       # value of input_line when the current page began
    toc_pfx: str = "sec."   # destination prefix for TOC entries

    # Per-instance containers; created in __init__ so that instances never
    # share (and mutate) one class-level list.
    lines: "list[Line]"
    chapters: dict[str, str]
    outline: "list[Outline]"

    def __init__(self, file: Union[TextIO, str]):
        """Set up the output and copy prelude.ps to it.

        Args:
            file: an open text stream, or a path that is opened for writing.

        Raises:
            OSError: if *file* (when a path) or prelude.ps cannot be opened.
        """
        if isinstance(file, str):
            self.out = open(file, "wt")
        else:
            self.out = file

        self.lines = []
        self.outline = []
        self.chapters = {}

        # Emit front-matter
        with open("prelude.ps", "rt") as prelude:
            for line in prelude:
                self.emit(line.strip("\n"))

    def do_line(self, line):
        """Consume one input line (without its newline).

        A form feed inside the line ends the current page; once one has been
        seen, header-based page guessing is switched off for good.
        """
        if '\f' in line:
            self.guess_page = False
            segments = line.split('\f')
            for segment in segments[:-1]:
                if segment != "":
                    self.real_do_line(segment)
                self.flush_page()
            self.real_do_line(segments[-1])
        else:
            self.real_do_line(line)
        self.input_line += 1

    def real_do_line(self, line):
        """Buffer one form-feed-free line, splitting off the margin columns."""
        # Expand tabs to 8-column stops.  str.expandtabs does not pad after
        # the final segment, so lines containing tabs no longer pick up
        # trailing blanks that would inflate the page width.
        if '\t' in line:
            line = line.expandtabs(8)
        if self.guess_page and self.line1pat.match(line):
            self.flush_page()
        is_change = len(line) > self.change_col and line[self.change_col] != ' '
        if len(line) >= self.marginalia:
            line_no = line[:self.ln_cols]
            content = line[self.marginalia:]
        else:
            # Too short to hold any body text: keep a padded number field only.
            line_no = (line + " " * self.ln_cols)[:self.ln_cols]
            content = ""
        self.lines.append(Line(line_no=line_no, content=content, is_change=is_change))

    def flush_page(self):
        """Emit the buffered lines as one page and reset the buffer.

        Does nothing (apart from recording the new page start) when no lines
        are buffered, which is what happens when the very first input line is
        a page header or the input begins with a form feed.
        """
        if not self.lines:
            self.pg_start = self.input_line
            return

        height = len(self.lines)
        width = max(len(line.content) for line in self.lines)

        last_chapter = self.last_chapter
        # The running head is looked for on the third line of the page;
        # shorter pages simply have none.
        chapter_m = self.ch_pat.match(self.lines[2].content) if height > 2 else None
        if chapter_m is not None:
            self.last_chapter = chapter = chapter_m.group(1)
            page = chapter_m.group(2)
        else:
            print(f"Warning: no page number found on pg {self.pg_start}", file=sys.stderr)
            chapter = last_chapter
            page = self.guess_next_page()

        self.last_page = page

        if chapter != last_chapter:
            self.is_toc = chapter == "CONTENTS"

        # Split the page into header / body / trailer without overlap, so a
        # page shorter than header_lines + trailer_lines emits no line twice.
        lines = list(enumerate(self.lines))
        head_end = min(self.header_lines, height)
        body_end = max(head_end, height - self.trailer_lines)
        header = lines[:head_end]
        body = lines[head_end:body_end]
        trailer = lines[body_end:]

        if self.is_toc:
            links, dests = self._scan_toc(body)
        else:
            links, dests = [], self._scan_body(body)

        # Begin emitting page
        self.emit(f"%%Page: {ps_string(page)} {self.page_n}")
        self.emit(f"{height} {width} {ps_string(page)} bP")

        for i, line in header:
            self.emit(f"{i} gL {ps_string(line.content)} tH")

        for i, line in body:
            chg = " mC" if line.is_change else ""
            self.emit(f"{i} gL {ps_string(line.line_no)} tN {ps_string(line.content)} tB{chg}")

        for i, line in trailer:
            self.emit(f"{i} gL {ps_string(line.content)} tH")

        for link in links:
            self.emit(f"{link[0]} {link[1]} {link[2]} {link[3]} {ps_string(link[4])} mL")
        for dest in dests:
            self.emit(f"{dest[0]} {ps_string(dest[1])} mD")

        self.emit("sP")

        self.lines = []
        # The %%Page ordinal has to advance with every page emitted.
        self.page_n += 1
        self.pg_start = self.input_line

    def _scan_toc(self, body):
        """Scan the body lines of a TOC page; return (links, dests).

        Also appends the entries found to self.outline and updates toc_pfx.
        """
        links = []
        dests = []
        partial_toc = ""
        partial_toc_start = None
        for i, line in body:
            text = line.content
            if text.strip() == "":
                partial_toc_start = None
                partial_toc = ""
                continue
            if text.strip() == "CONTENTS":
                dests.append((i, "sec.CONTENTS"))
                self.outline.append(Outline("sec.CONTENTS", "CONTENTS", []))
                continue
            if m := self.toc_chapter_re.match(text):
                label = m.group(1)
                title = m.group(2)
                num = label.split(' ')[1] if ' ' in label else label
                if label == "FIGURES":
                    self.toc_pfx = "sec.FIGURES."
                    dests.append((i, "sec.FIGURES"))
                elif label == "TABLES":
                    self.toc_pfx = "sec.TABLES."
                    dests.append((i, "sec.TABLES"))
                else:
                    # Entries under a chapter/appendix are plain sections
                    # again, even if a FIGURES/TABLES list came earlier.
                    self.toc_pfx = "sec."
                partial_toc_start = None
                partial_toc = ""
                left = len(text) - len(text.lstrip())
                right = len(text.rstrip())
                links.append((left, i, right, i, f"sec.{num}"))
                self.outline.append(Outline(f"sec.{num}", f"{label} - {title}" if title else label, []))
                continue

            # TODO: This won't work if an entry is split across pages.
            # To fix, partial_toc and partial_toc_start must be saved after/restored before
            # this function
            if partial_toc_start is not None:
                partial_toc += " " + text.strip()
            else:
                partial_toc = text.rstrip()
                partial_toc_start = i

            if m := self.toc_section_re.match(partial_toc):
                num = m.group(1)
                title = m.group(2)
                # Column where the entry number starts.
                left = m.start(1)
                right = len(text.rstrip())
                links.append((left, partial_toc_start, right, i, f"{self.toc_pfx}{num}"))
                self.add_outline(self.toc_pfx + num, title)
                partial_toc = ""
                partial_toc_start = None
            elif partial_toc:
                print(f"No match: {partial_toc!r}", file=sys.stderr)
        return links, dests

    def _scan_body(self, body):
        """Scan the body lines of an ordinary page; return its destinations."""
        dests = []
        partial_toc = ""
        partial_toc_start = None
        for i, line in body:
            text = line.content
            if text.strip() == "":
                partial_toc_start = None
                partial_toc = ""
                continue
            elif m := self.bdy_chapter_re.match(text):
                dests.append((i, "sec." + m.group(1)))

            if partial_toc_start is not None:
                partial_toc += " " + text.strip()
            else:
                partial_toc = text.rstrip()
                partial_toc_start = i

            if m := self.bdy_section_re.match(partial_toc):
                entry = (partial_toc_start, "sec." + m.group(1))
                # The growing paragraph can match again on later lines;
                # record each heading once only.
                if entry not in dests:
                    dests.append(entry)
        return dests

    def guess_next_page(self):
        """Guess the label of the page following last_page.

        Handles "<prefix>-<n>" labels (e.g. "3-4", "A-9"), plain decimal
        labels and roman numerals; "i" is returned when no page has been seen
        yet.  If the previous label cannot be interpreted, a warning is
        printed and the page ordinal is used instead of aborting the run.
        """
        last = self.last_page
        if last is None:
            return "i"
        if m := re.match(r"^(.*-)(\d+)$", last):
            return f"{m.group(1)}{1 + int(m.group(2))}"
        if re.fullmatch(r"\d+", last):
            return str(int(last) + 1)
        try:
            return romanize(1 + unromanize(last))
        except (ValueError, IndexError):
            print(f"Warning: cannot guess the page after {last!r}", file=sys.stderr)
            return str(self.page_n)

    def emit(self, s):
        """Write *s* followed by a newline to the output stream."""
        print(s, file=self.out)

    def emit_trailer(self):
        """Write the trailer section: the collected outline, then %%EOF."""
        self.emit("%%Trailer")
        self.emit_outline(self.outline)
        self.emit("%%EOF")

    def emit_outline(self, outline: "list[Outline]"):
        """Write *outline* depth-first, each entry followed by its children."""
        for item in outline:
            self.emit(f"{ps_string(item.dest_name)} {ps_string(item.label)} {len(item.subitems)} mO")
            self.emit_outline(item.subitems)

    def add_outline(self, dest, title):
        """Insert an outline entry for *dest* under its deepest existing ancestor.

        An ancestor is an existing entry whose dest_name followed by "." is a
        prefix of *dest*.  If the direct parent is missing, a warning is
        printed and the entry is attached to the nearest ancestor found.
        """
        entry = Outline(dest, title, [])
        typ = dest.split('.', 1)[0]

        # Walk down the tree while some entry at the current level is an
        # ancestor of dest.
        pfx = typ
        level = self.outline
        while True:
            parent = next(
                (item for item in level if dest.startswith(item.dest_name + ".")),
                None,
            )
            if parent is None:
                break
            pfx = parent.dest_name
            level = parent.subitems

        if typ == "sec" and dest.rsplit(".", 1)[0] != pfx:
            print(f"Warning: outline entry {dest!r} has no direct parent; "
                  f"attaching under {pfx!r}", file=sys.stderr)
        level.append(entry)
|
|
|
|
|
|
# Command-line entry point: read the listing from INPUT (stdin by default),
# run every line through a Processor and write the result to --output
# (stdout when the option is not given).
# Documented with comments rather than a docstring on purpose: click would
# turn a docstring into the command's --help text.
@click.command()
@click.option("-o", "--output", type=click.File("w"))
@click.argument("input", type=click.File("r"), default="-")
def main(output, input):
    sink = output if output else sys.stdout
    proc = Processor(sink)
    for raw in input:
        proc.do_line(raw.strip("\n"))
    # The last page has no page break after it, so flush it explicitly
    # before the trailer is written.
    proc.flush_page()
    proc.emit_trailer()


if __name__ == '__main__':
    main()