#!/usr/bin/env python3
"""Turn a paginated plain-text listing into a PostScript page stream.

Input lines are grouped into pages (on form feeds, or on a recognised
first header line), scanned for table-of-contents entries and chapter /
section headings, and written out as calls to short PostScript operators
(``bP``, ``gL``, ``tH``, ``tN``, ``tB``, ``mC``, ``mL``, ``mD``, ``sP``,
``mO``).  Those operators are not defined in this file; presumably they
come from ``prelude.ps``, which is copied to the output first.
"""
import sys
from dataclasses import dataclass
import re
import io
from typing import Optional, TextIO, Union

import click


@dataclass
class Line:
    # One input line split into its fixed-width columns.
    line_no: str  # the leading ``ln_cols`` characters of the line
    is_change: bool  # True when column ``change_col`` holds a non-space
    content: str  # the text from column ``marginalia`` onwards


@dataclass
class Outline:
    # One node of the outline tree emitted in the trailer.
    dest_name: str  # destination name, e.g. "sec.1.2"
    label: str  # text shown for the entry
    subitems: list["Outline"]  # child entries, in insertion order


# Roman-numeral tables, filled in by build_roman() when the module loads.
roman_lookup: list = []  # [(base, patterns)], ones first
roman_lookup_rev: dict = {}  # lower-case digit pattern -> value
roman_lookup_re = re.compile("")


def build_roman():
    """(Re)build the roman-numeral lookup tables and the parsing regex.

    Safe to call more than once: the tables are cleared first.
    """
    global roman_lookup_re
    # Entry j of each row is the spelling of digit j at that power of ten;
    # the leading "/" makes entry 0 the empty string.
    roman_lookup_raw = [
        "/i/ii/iii/iv/v/vi/vii/viii/ix",
        "/x/xx/xxx/xl/l/lx/lxx/lxxx/xc",
        "/c/cc/ccc/cd/d/dc/dcc/dccc/cm",
        "/m/mm/mmm",
    ]
    roman_lookup.clear()
    roman_lookup_rev.clear()
    re_parts = []
    for i, pats in enumerate(roman_lookup_raw):
        pats = pats.split('/')
        base = 10**i
        roman_lookup.append((base, pats))
        re_parts.append("(" + "|".join(pats) + ")")
        for j, pat in enumerate(pats):
            roman_lookup_rev[pat] = base * j
    # Numerals are written most-significant digit first, so the groups
    # must appear thousands -> ones.
    roman_lookup_re = re.compile(
        "^" + "".join(reversed(re_parts)) + "$", re.IGNORECASE
    )


build_roman()


def romanize(n):
    """Return *n* as a lower-case roman numeral (``""`` for 0).

    Raises:
        ValueError: if *n* is outside 0..3999, the range the tables cover.
    """
    if not 0 <= n < 4000:
        raise ValueError(f"{n!r} cannot be written as a roman numeral")
    res = []
    for base, pats in roman_lookup:
        res.append(pats[(n // base) % 10])
    return "".join(reversed(res))


def unromanize(s):
    """Return the integer value of the roman numeral *s* (any case).

    Raises:
        ValueError: if *s* is empty or not a well-formed roman numeral.
    """
    m = roman_lookup_re.match(s) if s else None
    if m:
        # Every group is one decimal digit; lower-case it because the
        # regex is case-insensitive but the reverse table is not.
        return sum(roman_lookup_rev[item.lower()] for item in m.groups())
    raise ValueError(f"{s!r} is not a valid roman numeral")


ps_str_escapes = re.compile(r'([()\\])')


def ps_string(s):
    """Return *s* as a PostScript string literal, escaping ``(``, ``)``, ``\\``."""
    s = ps_str_escapes.sub(r'\\\1', s)
    return f"({s})"


class Processor:
    """Collects input lines into pages and emits each page as PostScript."""

    # First line of a page header; used to detect page breaks while
    # ``guess_page`` is still enabled.
    line1pat = re.compile(r'^\s*Digital Equipment Corporation\s\s+Confidential And Proprietary\s*$')
    # Header line carrying the chapter title and "Page <label>".
    ch_pat = re.compile(r'^\s*((?:\S|\s\S)+)\s\s+Page (\S+)')
    toc_chapter_re = re.compile(r'^\s*(CHAPTER \d+|APPENDIX [A-Z]+|FIGURES|TABLES)(?:\s+((?:\S| \S)+))?\s*$')
    toc_section_re = re.compile(r'^\s*((?:\d+|[A-Z])(?:(?:\.\d+)+|-\d+))\s+((?:\S|\s[^. ])+)\s+\s*(?:\. )*\s*([A-Z]-\d+|\d+-\d+)$')
    # "APP?ENDIX" accepts the correct spelling (which the TOC pattern
    # links to) as well as the "APENDIX" spelling matched previously.
    bdy_chapter_re = re.compile(r'^\s*(?:CHAPTER|APP?ENDIX)\s*(\d+|[A-Z]+)\s*$')
    bdy_section_re = re.compile(r'^\s*((?:[A-Z]|\d+)(?:\.\d+)+)\s\s((?:\S|\s\S)+)\s*$')

    # Config settings
    out: TextIO = sys.stdout
    guess_page = True  # split pages on line1pat until a form feed is seen
    ln_cols = 6  # width of the line-number column
    change_col = 8  # column holding the change marker
    include_num = False  # not referenced elsewhere in this file
    number_color = 0.5  # not referenced elsewhere in this file
    header_color = 0.5  # not referenced elsewhere in this file
    marginalia = 11  # first column of the page content
    header_lines = 4  # leading lines of a page emitted with tH
    trailer_lines = 1  # trailing lines of a page emitted with tH
    fontsize = 10  # not referenced elsewhere in this file
    linespc = fontsize * 1.2  # not referenced elsewhere in this file

    # Lines of the page being collected; created per instance in __init__
    # so that instances never share one list.
    lines: list[Line]

    # running state
    last_chapter = None
    is_toc = False
    last_page = None
    page_n: int = 1  # ordinal written on the next %%Page comment
    input_line: int = 0  # number of input lines consumed so far
    pg_start: int = 0  # value of input_line when the current page began
    toc_pfx: str = "sec."  # prefix for destinations of TOC section entries
    chapters: dict[str, str]
    outline: list[Outline]
    _owns_out = False  # True when __init__ opened ``out`` itself

    def __init__(self, file: Union[TextIO, str]):
        """Create a processor writing to *file* and emit the prelude.

        Args:
            file: an open text stream, or a path that is opened for
                writing (and closed again by ``close()``).

        Raises:
            OSError: if the output path or ``prelude.ps`` (looked up in
                the current working directory) cannot be opened.
        """
        if isinstance(file, str):
            self.out = open(file, "wt")
            self._owns_out = True
        else:
            self.out = file
        self.lines = []
        self.outline = []
        self.chapters = {}
        # Emit front-matter
        try:
            with open("prelude.ps", "rt") as prelude:
                for line in prelude:
                    self.emit(line.strip("\n"))
        except Exception:
            # Do not leak a file we opened when the prelude is unusable.
            self.close()
            raise

    def close(self):
        """Close the output stream if this processor opened it from a path."""
        if self._owns_out:
            self.out.close()
            self._owns_out = False

    def do_line(self, line):
        """Consume one input line (without its newline).

        Every form feed in *line* ends the current page; the first form
        feed seen also switches off header-based page guessing.
        """
        if '\f' in line:
            self.guess_page = False
            *finished, rest = line.split('\f')
            for segment in finished:
                if segment != "":
                    self.real_do_line(segment)
                self.flush_page()
            self.real_do_line(rest)
        else:
            self.real_do_line(line)
        self.input_line += 1

    def real_do_line(self, line):
        """Split one form-feed-free line into columns and add it to the page."""
        # Expand tabs to 8-column stops.  expandtabs() adds nothing after
        # the final segment, so tabbed lines no longer gain trailing
        # padding that inflated the computed page width.
        if '\t' in line:
            line = line.expandtabs(8)

        if self.guess_page and self.line1pat.match(line):
            self.flush_page()

        is_change = len(line) > self.change_col and line[self.change_col] != ' '
        if len(line) >= self.marginalia:
            line_no = line[:self.ln_cols]
            content = line[self.marginalia:]
        else:
            # Too short to reach the content column: pad the number field.
            line_no = (line + " " * self.ln_cols)[:self.ln_cols]
            content = ""
        self.lines.append(Line(line_no=line_no, content=content, is_change=is_change))

    def flush_page(self):
        """Emit the collected lines as one page and start a new page.

        A page with no lines, or with nothing but blank lines, is dropped
        without output.  That is the state at the first header match or a
        leading form feed, and it previously crashed in ``max()``.
        """
        page_lines = self.lines
        if not any(
            ln.is_change or ln.line_no.strip() or ln.content.strip()
            for ln in page_lines
        ):
            self.lines = []
            self.pg_start = self.input_line
            return

        height = len(page_lines)
        width = max(len(line.content) for line in page_lines)

        last_chapter = self.last_chapter
        # The chapter / page-number header is looked for on the third
        # line; shorter pages simply have none.
        chapter_m = self.ch_pat.match(page_lines[2].content) if height > 2 else None
        if chapter_m is not None:
            self.last_chapter = chapter = chapter_m.group(1)
            page = chapter_m.group(2)
        else:
            print(f"Warning: no page number found on pg {self.pg_start}", file=sys.stderr)
            chapter = last_chapter
            page = self.guess_next_page()
        self.last_page = page

        if chapter != last_chapter:
            self.is_toc = chapter == "CONTENTS"

        indexed = list(enumerate(page_lines))
        # Explicit boundaries instead of a negative slice, so that
        # trailer_lines == 0 works and a short page never emits the same
        # line as both header and trailer.
        header_end = min(self.header_lines, height)
        body_end = max(height - self.trailer_lines, header_end)
        body = indexed[header_end:body_end]

        if self.is_toc:
            links, dests = self._scan_toc(body)
        else:
            links, dests = [], self._scan_body(body)

        self._emit_page(page, height, width, indexed, header_end, body_end, links, dests)

        self.page_n += 1  # the %%Page ordinal must advance with every page
        self.lines = []
        self.pg_start = self.input_line

    def _scan_toc(self, body):
        """Collect links, destinations and outline entries from a TOC page."""
        links = []
        dests = []
        partial_toc = ""
        partial_toc_start = None
        for i, line in body:
            text = line.content
            stripped = text.strip()
            if stripped == "":
                partial_toc_start = None
                partial_toc = ""
                continue
            if stripped == "CONTENTS":
                dests.append((i, "sec.CONTENTS"))
                self.outline.append(Outline("sec.CONTENTS", "CONTENTS", []))
                continue
            if m := self.toc_chapter_re.match(text):
                label = m.group(1)
                title = m.group(2)
                if ' ' in label:
                    num = label.split(' ')[1]
                else:
                    # Only FIGURES and TABLES match without a space; the
                    # entries that follow are filed under that heading.
                    num = label
                    self.toc_pfx = f"sec.{label}."
                    dests.append((i, f"sec.{label}"))
                partial_toc_start = None
                partial_toc = ""
                left = len(text) - len(text.lstrip())
                right = len(text.rstrip())
                links.append((left, i, right, i, f"sec.{num}"))
                self.outline.append(
                    Outline(f"sec.{num}", f"{label} - {title}" if title else label, [])
                )
                continue

            # TODO: This won't work if an entry is split across pages.
            # To fix, partial_toc and partial_toc_start must be saved
            # after/restored before this function
            if partial_toc_start is not None:
                partial_toc += " " + stripped
            else:
                partial_toc = text.rstrip()
                partial_toc_start = i
            if m := self.toc_section_re.match(partial_toc):
                num = m.group(1)
                title = m.group(2)
                left = m.start(1)  # column where the section number begins
                right = len(text.rstrip())
                links.append((left, partial_toc_start, right, i, f"{self.toc_pfx}{num}"))
                self.add_outline(self.toc_pfx + num, title)
                partial_toc = ""
                partial_toc_start = None
            elif partial_toc:
                print(f"No match: {partial_toc!r}", file=sys.stderr)
        return links, dests

    def _scan_body(self, body):
        """Collect chapter and section destinations from an ordinary page."""
        dests = []
        partial_toc = ""
        partial_toc_start = None
        for i, line in body:
            text = line.content
            if text.strip() == "":
                partial_toc_start = None
                partial_toc = ""
                continue
            if m := self.bdy_chapter_re.match(text):
                dests.append((i, "sec." + m.group(1)))
                partial_toc_start = None
                partial_toc = ""
                continue
            if partial_toc_start is not None:
                partial_toc += " " + text.strip()
            else:
                partial_toc = text.rstrip()
                partial_toc_start = i
            if m := self.bdy_section_re.match(partial_toc):
                dests.append((partial_toc_start, "sec." + m.group(1)))
                # Start afresh, as the TOC scan does; otherwise every
                # following line of the paragraph re-matched and emitted
                # the same destination again.
                partial_toc = ""
                partial_toc_start = None
        return dests

    def _emit_page(self, page, height, width, indexed, header_end, body_end, links, dests):
        """Write the PostScript for one page."""
        self.emit(f"%%Page: {ps_string(page)} {self.page_n}")
        self.emit(f"{height} {width} {ps_string(page)} bP")
        for i, line in indexed[:header_end]:
            self.emit(f"{i} gL {ps_string(line.content)} tH")
        for i, line in indexed[header_end:body_end]:
            chg = " mC" if line.is_change else ""
            self.emit(f"{i} gL {ps_string(line.line_no)} tN {ps_string(line.content)} tB{chg}")
        for i, line in indexed[body_end:]:
            self.emit(f"{i} gL {ps_string(line.content)} tH")
        for link in links:
            self.emit(f"{link[0]} {link[1]} {link[2]} {link[3]} {ps_string(link[4])} mL")
        for dest in dests:
            self.emit(f"{dest[0]} {ps_string(dest[1])} mD")
        self.emit("sP")

    def guess_next_page(self):
        """Guess the label of a page whose header gave no page number.

        Returns ``"i"`` before any page has been seen.  Otherwise the
        previous label is incremented: ``"3-7"`` -> ``"3-8"``,
        ``"A-1"`` -> ``"A-2"``, ``"12"`` -> ``"13"``, ``"iv"`` -> ``"v"``
        (an all-upper-case roman label stays upper-case).

        Raises:
            ValueError: if the previous label is in none of these forms,
                or the next roman numeral would exceed 3999.
        """
        last: Optional[str] = self.last_page
        if last is None:
            return "i"
        # Optional "<chapter>-" prefix (digits or appendix letters)
        # followed by a non-empty arabic page number.
        if m := re.match(r"^([A-Za-z0-9]*-)?(\d+)$", last):
            return f"{m.group(1) or ''}{1 + int(m.group(2))}"
        guess = romanize(1 + unromanize(last))
        return guess.upper() if last.isupper() else guess

    def emit(self, s):
        """Write *s* and a newline to the output stream."""
        print(s, file=self.out)

    def emit_trailer(self):
        """Write the document trailer, including the collected outline."""
        self.emit("%%Trailer")
        self.emit_outline(self.outline)
        self.emit("%%EOF")

    def emit_outline(self, outline: list[Outline]):
        """Write *outline* depth-first, each entry followed by its children."""
        for item in outline:
            self.emit(f"{ps_string(item.dest_name)} {ps_string(item.label)} {len(item.subitems)} mO")
            self.emit_outline(item.subitems)

    def add_outline(self, dest, title):
        """Insert an outline entry for *dest* beneath its closest ancestor.

        The tree is descended through entries whose destination name plus
        ``"."`` is a prefix of *dest*; the new entry is appended where no
        child matches any further.
        """
        entry = Outline(dest, title, [])
        typ = dest.split('.', 1)[0]

        def recur(pfx, elist: list[Outline]):
            for item in elist:
                if dest.startswith(item.dest_name + "."):
                    recur(item.dest_name, item.subitems)
                    break
            else:
                if typ == "sec":
                    # The immediate parent must already be in the tree.
                    assert dest.rsplit(".", 1)[0] == pfx
                elist.append(entry)

        recur(typ, self.outline)


@click.command()
@click.option("-o", "--output", type=click.File("w"))
@click.argument("input", type=click.File("r"), default="-")
def main(output, input):
    # Command-line entry point.  Deliberately a comment, not a docstring:
    # click would print a docstring as the --help text.
    proc = Processor(output or sys.stdout)
    for line in input:
        proc.do_line(line.strip("\n"))
    proc.flush_page()  # emit whatever remains after the last page break
    proc.emit_trailer()


if __name__ == '__main__':
    main()