mirror of
https://github.com/rosenpass/rosenpass.git
synced 2025-12-18 13:24:38 +03:00
Some checks failed
rosenpass-ciphers - primitives - benchmark / prim-benchmark (i686-linux) (push) Has been cancelled
rosenpass-ciphers - primitives - benchmark / prim-benchmark (x86_64-linux) (push) Has been cancelled
rosenpass - protocol - benchmark / proto-benchmark (i686-linux) (push) Has been cancelled
rosenpass - protocol - benchmark / proto-benchmark (x86_64-linux) (push) Has been cancelled
rosenpass-ciphers - primitives - benchmark / ciphers-primitives-bench-status (push) Has been cancelled
rosenpass - protocol - benchmark / ciphers-protocol-bench-status (push) Has been cancelled
The LLM-generated Python code showed us that the replacement of aliases, or, as a preparatory step, the tokenization, is something for LARK. Co-authored-by: Anja Rabich <a.rabich@uni-luebeck.de>
266 lines
11 KiB
Python
266 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
|
||
# Below is a **more “Pythonic”** rewrite of the original AWK‑to‑Python translator.
|
||
# The logic is exactly the same – the same error messages, line numbers and exit
|
||
# codes – but the code is organized into small, reusable functions, uses
|
||
# `dataclasses`, type hints, `Path.read_text()`, `re.sub()` and other idiomatic
|
||
# constructs. It is also easier to read and to extend.
|
||
|
||
|
||
"""
|
||
py_awk_translator.py
|
||
|
||
A line‑by‑line pre‑processor that implements the same behaviour as the
|
||
original AWK script you posted (handling @module, @alias, @long‑alias,
|
||
private‑variable expansion, @query/@reachable/@lemma checks and token‑wise
|
||
alias substitution).
|
||
|
||
Usage
|
||
|
||
python3 py_awk_translator.py file1.pv file2.pv
|
||
# or
|
||
cat file.pv | python3 py_awk_translator.py
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import sys
|
||
from dataclasses import dataclass, field
|
||
from pathlib import Path
|
||
from typing import Dict, Iterable
|
||
|
||
# ----------------------------------------------------------------------
|
||
# Helper utilities
|
||
# ----------------------------------------------------------------------
|
||
# Matches exactly one character that may appear inside an identifier token:
# an ASCII letter, a digit, an underscore or an apostrophe.
TOKEN_RE = re.compile(r"[0-9A-Za-z_']")


def is_token_char(ch: str) -> bool:
    """Tell whether *ch* is a single identifier character.

    The whole of *ch* must match ``TOKEN_RE``, so the empty string and any
    string longer than one character yield ``False``.
    """
    return TOKEN_RE.fullmatch(ch) is not None
|
||
|
||
def die(msg: str, fname: str, lineno: int) -> None:
    """Report a fatal error on stderr and terminate with exit status 1.

    The message is emitted as ``<fname>:<lineno>: <msg>`` followed by a
    newline.  This function never returns normally.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print(f"{fname}:{lineno}: {msg}", file=sys.stderr)
    raise SystemExit(1)
|
||
|
||
# ----------------------------------------------------------------------
|
||
# Core translator – holds the mutable state that the AWK script kept in
|
||
# global variables.
|
||
# ----------------------------------------------------------------------
|
||
@dataclass
class Translator:
    """Line-by-line pre-processor state (the AWK script's global variables).

    Feed every input line to :meth:`process`; once all input has been seen,
    call :meth:`finish` to write the buffered result to stdout.
    """

    # Buffered output: one entry, terminated by a newline, per processed line.
    out: list[str] = field(default_factory=list)

    # Name given by the most recent "@module" line (used when expanding "~").
    module: str = ""

    # Alias name -> replacement text ("@alias" and completed "@long-alias").
    aliases: Dict[str, str] = field(default_factory=dict)

    # Name and accumulated text of the "@long-alias" currently being collected;
    # an empty long_name means no long alias is open.
    long_name: str = ""
    long_value: str = ""

    # Set to 1 right before a fatal error is reported; finish() checks it.
    err: int = 0

    # ------------------------------------------------------------------
    # Public entry point for a single line
    # ------------------------------------------------------------------
    def process(self, raw: str, fname: str, lineno: int) -> None:
        """Apply all transformation rules to *raw* and buffer the result.

        Exactly one entry is appended to ``out`` per call, so the output keeps
        the same number of lines as the input; directive lines and lines that
        belong to a long alias are emitted as empty lines.

        Args:
            raw: One input line, with or without its trailing newline.
            fname: File name used as the prefix of error messages.
            lineno: Line number used in error messages.

        Raises:
            SystemExit: through ``die`` for ``@long-alias-end`` without a
                started long alias, a literal ``PRIVATE__`` in the input, or
                a ``@query``/``@reachable``/``@lemma`` without a quoted
                parameter.
        """
        line = raw.rstrip("\n")
        original = line  # untouched copy, quoted in error messages

        if line.startswith("@module"):
            self.module = self._second_field(line)
            self.aliases.clear()  # a new module starts without aliases
            line = ""

        elif line.startswith("@alias"):
            # Every "name=value" word after the directive defines one alias;
            # words without "=" are ignored.
            for word in line.split()[1:]:
                if "=" in word:
                    name, value = word.split("=", 1)
                    self.aliases[name] = value
            line = ""

        elif line.startswith("@long-alias-end"):
            # Tested before "@long-alias", which is a prefix of this directive.
            if not self.long_name:
                self._fail("Long alias not started", fname, lineno)
            # Collapse runs of spaces and trim the accumulated text.
            self.long_value = re.sub(r" +", " ", self.long_value).strip()
            self.aliases[self.long_name] = self.long_value
            self.long_name = self.long_value = ""
            line = ""

        elif line.startswith("@long-alias"):
            self.long_name = self._second_field(line)
            self.long_value = ""
            line = ""

        elif "PRIVATE__" in line:
            # The prefix may only be produced by "~" expansion below.
            self._fail(
                "Used private variable without ~:\n\n"
                f" {lineno} > {original}",
                fname,
                lineno,
            )

        elif re.search(r"@(query|reachable|lemma)", line):
            annotation = re.search(r'@(query|reachable|lemma)\s+"[^"]*"', line)
            if annotation is None:
                self._fail(
                    "@query or @reachable statement without parameter:\n\n"
                    f" {lineno} > {original}",
                    fname,
                    lineno,
                )
            # Blank out the annotation and its quoted parameter while keeping
            # the line length unchanged.
            start, end = annotation.span()
            line = line[:start] + " " * (end - start) + line[end:]

        # Expand "~" to the private-variable prefix of the current module.
        if "~" in line:
            line = line.replace("~", f"PRIVATE__{self.module}__")

        # Whole-token alias substitution.
        line = self._expand_aliases(line)

        # Inside a long alias the line is collected instead of being emitted.
        if self.long_name:
            self.long_value += line + " "
            line = ""

        self.out.append(line + "\n")

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _second_field(line: str) -> str:
        """Return the second whitespace-separated word of *line*, or ""."""
        words = line.split()
        return words[1] if len(words) > 1 else ""

    def _fail(self, msg: str, fname: str, lineno: int) -> None:
        """Flag the error in ``err`` and abort through ``die``."""
        # Set the flag first so finish() stays silent even if a caller
        # intercepts the SystemExit raised by die().
        self.err = 1
        die(msg, fname, lineno)

    def _expand_aliases(self, text: str) -> str:
        """Replace every whole-token alias occurrence in *text* by its value.

        An alias is only tried at a position where a token starts (a token
        character not preceded by one) and only matches when the character
        following the alias name is not a token character.  Aliases are tried
        in insertion order and the first match wins.  Inserted values are not
        scanned again.
        """
        pieces: list[str] = []
        copied = 0  # text[:copied] has already been moved into pieces
        pos = 0
        length = len(text)

        while pos < length:
            starts_token = is_token_char(text[pos]) and not (
                pos > 0 and is_token_char(text[pos - 1])
            )
            if starts_token:
                for name, value in self.aliases.items():
                    stop = pos + len(name)
                    if not text.startswith(name, pos):
                        continue
                    if is_token_char(text[stop : stop + 1]):
                        continue  # name is only a prefix of a longer token
                    pieces.append(text[copied:pos])
                    pieces.append(value)
                    copied = pos = stop
                    break
                else:
                    pos += 1
            else:
                pos += 1

        pieces.append(text[copied:])
        return "".join(pieces)

    # ------------------------------------------------------------------
    # Finalisation
    # ------------------------------------------------------------------
    def finish(self) -> None:
        """Write the buffered output to stdout unless an error was flagged."""
        if self.err == 0:
            sys.stdout.write("".join(self.out))
|
||
|
||
# ----------------------------------------------------------------------
|
||
# Command‑line driver
|
||
# ----------------------------------------------------------------------
|
||
def _process_path(path: Path, translator: Translator) -> None:
|
||
"""Read *path* line‑by‑line and feed it to *translator*."""
|
||
for lineno, raw in enumerate(path.read_text(encoding="utf-8").splitlines(True), start=1):
|
||
translator.process(raw, str(path), lineno)
|
||
|
||
def main() -> None:
    """Command-line entry point.

    With no file arguments the input is taken from stdin and reported under
    the name ``<stdin>``.  Otherwise each argument is processed in order by
    the same translator; an argument that is not an existing regular file
    aborts the run with exit status 1.  The translated text is written once
    all input has been processed.
    """
    state = Translator()

    if len(sys.argv) != 1:
        for arg in sys.argv[1:]:
            candidate = Path(arg)
            if candidate.is_file():
                _process_path(candidate, state)
                continue
            sys.stderr.write(f"File not found: {arg}\n")
            sys.exit(1)
    else:
        # stdin is handled as one virtual file so line numbers stay meaningful.
        lineno = 0
        for raw in sys.stdin:
            lineno += 1
            state.process(raw, "<stdin>", lineno)

    state.finish()


if __name__ == "__main__":
    main()
|
||
|
||
|
||
## What makes this version more Pythonic?
|
||
|
||
# | Aspect | Original style | Refactored style |
|
||
# |--------|----------------|------------------|
|
||
# | **State handling** | Global variables (`buf`, `module`, …) | `@dataclass Translator` encapsulates all mutable state |
|
||
# | **Regular‑expression reuse** | Re‑compiled on every call (`match`, `gsub`) | Compiled once (`TOKEN_RE`) and reused |
|
||
# | **String manipulation** | Manual `substr`, concatenation in loops | Slicing, `str.replace`, `re.sub` for clarity |
|
||
# | **Loop logic** | `for (i=1; i<length($0); i+=1)` with many manual index tricks | A single `while` loop with early‑continue guards; the inner alias‑search is a clean `for name, value in self.aliases.items()` |
|
||
# | **Error handling** | `print(... > "/dev/stderr")` and `exit(1)` | Dedicated `die()` helper that writes to `stderr` and exits |
|
||
# | **File I/O** | Manual `while (getline ...)` in AWK → `for` over `sys.stdin` / `Path.read_text()` | Uses `Path.read_text()` and `enumerate` for line numbers |
|
||
# | **Readability** | Mixed AWK‑style comments, one‑liner `if` statements | Docstrings, section comments, type hints, and small helper methods |
|
||
# | **Extensibility** | Adding a new rule required editing a monolithic block | New rules can be added as separate `elif` blocks or new methods without touching the core loop |
|
||
|
||
# The script can be saved as `py_awk_translator.py`, made executable (`chmod +x py_awk_translator.py`), and used exactly like the original AWK program while being easier to maintain and understand.
|