transpiler_pro
Location: src/transpiler_pro/__init__.py
Description: Transpiler-Pro - Enterprise Documentation Pipeline.
This package provides a comprehensive engine for converting Markdown to AsciiDoc while enforcing linguistic standards and branding consistency.
Core Features:
- DocConverter: Shield-Transpile-Restore logic using Pandoc.
- StyleLinter: Orchestration of Vale-based style validation.
- StyleFixer: NLP-driven 'Auto-Heal' for linguistic repair.
The pipeline is entirely data-driven, utilizing settings defined in pyproject.toml.
1""" 2Location: src/transpiler_pro/__init__.py 3 4Description: Transpiler-Pro - Enterprise Documentation Pipeline. 5 6This package provides a comprehensive engine for converting Markdown to 7AsciiDoc while enforcing linguistic standards and branding consistency. 8 9Core Features: 10 111. **DocConverter**: Shield-Transpile-Restore logic using Pandoc. 122. **StyleLinter**: Orchestration of Vale-based style validation. 133. **StyleFixer**: NLP-driven 'Auto-Heal' for linguistic repair. 14 15The pipeline is entirely data-driven, utilizing settings defined in `pyproject.toml`. 16""" 17 18from .core.converter import DocConverter 19from .core.linter import StyleLinter 20from .core.fixer import StyleFixer 21 22__version__ = "1.0.0" 23__author__ = "Sushant Gaurav" 24 25 26def get_info() -> str: 27 """ 28 Returns the basic identity string for the package. 29 30 Returns: 31 str: A formatted string containing the tool name, version, and purpose. 32 """ 33 return f"Transpiler-Pro v{__version__} - Enterprise Documentation Engine" 34 35 36# Defines the public API exposed at the top level of the package 37__all__ = [ 38 "DocConverter", 39 "StyleLinter", 40 "StyleFixer", 41 "__version__", 42 "get_info" 43]
class DocConverter:
    """
    A pattern-driven engine that transforms Markdown into Enterprise AsciiDoc.

    The conversion is a Shield-Transpile-Restore pipeline: modern Markdown
    syntax is "shielded" behind inert tokens (pre_process_markdown), Pandoc
    performs the core transpilation (convert_file), and the tokens are then
    restored and normalized for Antora (post_process_asciidoc).

    Attributes:
        config (Dict): Configuration extracted from pyproject.toml.
        metadata (Dict): Extracted frontmatter (YAML) from the source file.
        discovered_title (str): The inferred document title (H1 or YAML).
    """

    def __init__(self, config_path: Optional[Path] = None):
        """Initializes the converter and loads conversion patterns.

        Args:
            config_path (Path, optional): Path to pyproject.toml.
                Defaults to "pyproject.toml" in the working directory.
        """
        self.config_path = config_path or Path("pyproject.toml")
        self.config = self._load_project_config()
        self.conv_cfg = self.config.get("conversions", {})
        self.metadata: Dict[str, Any] = {}
        self.discovered_title = None
        self.used_ids: Set[str] = set()
        self.protected_json: List[str] = []
        # --- SUSE Branding Attribute Map ---
        # Format: "Raw Text to Find": "{attribute-variable-name}"
        # ORDER MATTERS: Longest strings first to prevent partial matching.
        self.attribute_map = {
            "SUSE® Rancher Prime: Admission Policy Manager": "{kubewarden-product-name}",
            "SUSE® Rancher Prime: Continuous Delivery": "{fleet-product-name}",
            "SUSE® Rancher Prime: OS Manager": "{elemental-product-name}",
            "SUSE® Rancher Prime: Cluster API": "{turtles-product-name}",
            "SUSE® Rancher Prime: K3s": "{k3s-product-name}",
            "SUSE® Rancher Prime: RKE2": "{rke2-product-name}",
            "SUSE® Rancher Prime": "{rancher-product-name-tm}",
            "SUSE Rancher Prime": "{rancher-product-name}",
            "SUSE® Virtualization": "{harvester-product-name-tm}",
            "SUSE Virtualization": "{harvester-product-name}",
            "SUSE® Storage": "{longhorn-product-name-tm}",
            "SUSE Storage": "{longhorn-product-name}",
            "SUSE® Security": "{neuvector-product-name}",
            "SUSE® Losant": "{losant-product-name}",
            "SUSE Losant": "{losant-product-name}"
        }

    def _load_project_config(self) -> Dict[str, Any]:
        """Loads the [tool.transpiler-pro] configuration block.

        Reads self.config_path as TOML. Any read/parse failure is treated
        as "no configuration" and yields an empty dict (best-effort).

        Returns:
            Dict: The configuration dictionary for transpiler-pro.
        """
        if not self.config_path.exists():
            return {}
        try:
            with open(self.config_path, "rb") as f:
                # Deferred import: tomllib is stdlib only from Python 3.11+.
                import tomllib
                return tomllib.load(f).get("tool", {}).get("transpiler-pro", {})
        except Exception:
            # A broken pyproject.toml must not abort the conversion.
            return {}

    def _apply_global_attributes(self, text: str) -> str:
        """
        Replaces raw product names with Antora attributes.
        Uses negative lookbehind/lookahead to protect URLs and file paths.

        Args:
            text (str): The input string to process.

        Returns:
            str: The text with product names replaced by attributes.
        """
        # Guard clause for empty text to avoid unnecessary processing.
        if not text:
            return text

        # We iterate through the attribute map and apply replacements.
        # The regex ensures we only replace standalone occurrences of the
        # product names, not when they are part of URLs or file paths.
        for raw_name, attr in self.attribute_map.items():
            # Refined Regex: Protects URLs (/:) but allows trailing periods in sentences
            pattern = rf"(?<![/:])\b{re.escape(raw_name)}\b"
            text = re.sub(pattern, attr, text)

        return text

    def _slugify(self, text: str) -> str:
        """
        Converts a heading title into a SEO-friendly, unique ID.
        Example: "Access Keys & Security" -> "access-keys-security"

        Args:
            text (str): The raw heading text.

        Returns:
            str: A slugified version suitable for use as an anchor ID.
        """
        # 1. Lowercase and strip technical syntax and HTML/JSX tags
        slug = text.lower()
        slug = re.sub(r'<[^>]+>', '', slug)       # Remove HTML tags
        slug = re.sub(r'\{#.*?\}', '', slug)      # Remove existing MD IDs
        slug = re.sub(r'[^a-z0-9\s-]', '', slug)  # Remove special chars

        # 2. Replace spaces/multiple dashes/underscores with a single dash
        slug = re.sub(r'[\s_/-]+', '-', slug).strip('-')

        # 3. Handle uniqueness within the document (Collision Avoidance)
        base_slug = slug or "section"
        final_slug = base_slug
        counter = 1
        while final_slug in self.used_ids:
            final_slug = f"{base_slug}-{counter}"
            counter += 1

        self.used_ids.add(final_slug)
        return final_slug

    def pre_process_markdown(self, content: str) -> str:
        """
        Prepares Markdown for Pandoc by shielding modern syntax and extracting metadata.

        Args:
            content (str): Raw Markdown string.

        Returns:
            str: "Shielded" Markdown ready for Pandoc.
        """
        self.metadata = {}
        self.discovered_title = None
        self.used_ids = set()
        # Initialize storage for JSON components to protect them from Pandoc
        self.protected_json = []

        # --- 1. CODE BLOCK SHIELDING ---
        # We protect '#' characters inside code blocks so the Title Scavenger
        # doesn't accidentally treat a code comment as the document's H1 title.
        content = re.sub(r'(`{3}.*?`{3})', lambda m: m.group(1).replace('#', 'HASHSHIELD'), content, flags=re.DOTALL)

        # --- 2. FRONTMATTER EXTRACTION ---
        # Extracts YAML metadata (title, description, etc.) from the top of the MD file.
        frontmatter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if frontmatter_match:
            try:
                yaml_data = yaml.safe_load(frontmatter_match.group(1))
                if isinstance(yaml_data, dict):
                    self.metadata = yaml_data
                # Strip the frontmatter block from the body in either case.
                content = content[frontmatter_match.end():]
            except Exception:
                self.metadata = {}

        # --- 3. TITLE SCAVENGER ---
        # Logic: Priority 1 is YAML 'title'. Priority 2 is the first H1 (#) found.
        self.discovered_title = self.metadata.get('title')
        if not self.discovered_title:
            h1_match = re.search(r'^#\s+(.*)$', content, re.M)
            if h1_match:
                self.discovered_title = h1_match.group(1).strip()
                # Remove the H1 from body as it will be promoted to the
                # AsciiDoc Document Title (=).
                content = content.replace(h1_match.group(0), "", 1)

        # Restore shielded hashes after title scavenging is safe.
        content = content.replace('HASHSHIELD', '#')

        # --- 4. VIDEO & COMPLEX PATTERN SHIELDING ---
        # Replaces complex HTML/Markdown blocks with tokens that Pandoc will ignore.
        # This protects <iframe> embeds and custom ':::tabs' blocks.
        content = re.sub(r'<iframe.*?embed/([^"?\s]+).*?</iframe>', r'VIDEOTOKEN\1', content)

        patterns = self.conv_cfg.get("shielding_patterns", [])
        for p in patterns:
            regex = p.get("regex")
            replacement = p.get("replacement")

            if p.get("hook") == "protect_spaces":
                # Special hook for collapsibles to ensure spaces in titles aren't lost.
                def protect_hook(match: Match) -> str:
                    title = match.group(1).strip().replace(' ', 'PROTECTSPACE')
                    body = match.group(2).strip()
                    return replacement.replace(r"\1", title).replace(r"\2", body)
                content = re.sub(regex, protect_hook, content, flags=re.S)
            else:
                content = re.sub(regex, replacement, content, flags=re.S)

        # --- JSON COMPONENT SHIELDING ---
        # Protects <JsonDisplay /> from being mangled into latexmath/footnotes by Pandoc
        def shield_json_display(match):
            # Pure alphanumeric placeholder to avoid Pandoc escaping
            placeholder = f"JSONP{len(self.protected_json)}PROTECT"
            self.protected_json.append(match.group(1))
            return placeholder

        content = re.sub(r'(<JsonDisplay.*?\/>)', shield_json_display, content, flags=re.DOTALL)

        # Protect existing Markdown IDs so Pandoc doesn't mangle curly braces
        content = re.sub(r'\{#(.*?)\}', r'IDSHIELDSTART\1IDSHIELDEND', content)

        return content

    def post_process_asciidoc(self, content: str) -> str:
        """
        Finalizes the AsciiDoc output after Pandoc has finished.

        Order of Operations:
        1. Reset ID tracker and prioritize H1 Document Title.
        2. Process H2-H6 headings with collision avoidance.
        3. Apply Global Branding Attributes to body prose.
        4. Construct the Metadata Header.
        5. Restore shielded blocks and clean Pandoc noise.
        6. Apply Antora-specific normalization (Xrefs & Image Scaling).

        Args:
            content (str): The raw AsciiDoc output from Pandoc.

        Returns:
            str: The finalized AsciiDoc content ready for Antora.
        """
        # --- 1. INITIALIZATION & H1 PRIORITY ---
        self.used_ids = set()
        today = datetime.now().strftime("%Y-%m-%d")

        # A. Create the ID Slug from the RAW title BEFORE branding
        # This prevents ID leakage (e.g., [#suse-storage] instead of [#{longhorn-product-name}])
        title_slug = self._slugify(self.discovered_title or "untitled")

        # B. Prepare the title for DISPLAY only
        title_text = self.discovered_title or "Untitled Document"
        title_text = self._apply_global_attributes(title_text)

        # --- 2. HEADING SLUGGING & NORMALIZATION ---
        def heading_anchor_logic(match):
            level_chars = match.group(1)
            raw_title = match.group(2).strip()

            # Check for shielded custom IDs from Markdown (e.g., {#my-custom-id})
            custom_id_match = re.search(r'IDSHIELDSTART(.*?)IDSHIELDEND', raw_title)

            if custom_id_match:
                base_id = custom_id_match.group(1)
                display_title = raw_title.replace(custom_id_match.group(0), "").strip()

                # Collision avoidance even for custom IDs
                final_id = base_id
                counter = 1
                while final_id in self.used_ids:
                    final_id = f"{base_id}-{counter}"
                    counter += 1
                self.used_ids.add(final_id)
                # Apply branding to display title after ID is locked
                display_title = self._apply_global_attributes(display_title)
            else:
                # Regular heading: slugify RAW title first for clean SEO
                final_id = self._slugify(raw_title)
                # Then brand the display title for the reader
                display_title = self._apply_global_attributes(raw_title)

            # We return the heading with an explicit anchor to ensure URL
            # stability, even if the title text changes in the future.
            return f"\n[#{final_id}]\n{level_chars} {display_title}"

        # Transform H2-H6 levels (Pandoc's == syntax)
        content = re.sub(r'\n(={2,6})\s+(.*)', heading_anchor_logic, content)

        # Heading cleanup
        content = re.sub(r'IDSHIELDSTART.*?IDSHIELDEND', '', content)
        content = re.sub(r'^={6,}\s+', r'===== ', content, flags=re.M)

        # C. Apply Global Attributes to the body content AFTER headings are locked
        content = self._apply_global_attributes(content)

        # --- 3. CONSTRUCT HEADER BLOCK ---
        header_lines = [
            f"[#{title_slug}]",
            f"= {title_text}",
            ":idprefix:",
            ":idseparator: -"
        ]

        # Inject YAML metadata
        for key, value in self.metadata.items():
            if key.lower() != "title":
                header_lines.append(f":{key}: {value}")

        header_lines.append(f":revdate: {today}")

        # Add global Antora headers from config
        antora_cfg = self.config.get("antora", {})
        header_lines.extend(antora_cfg.get("headers", []))
        header_block = "\n".join(header_lines) + "\n\n"

        # --- 4. MARKER RESTORATION & CLEANUP ---
        # Restore JSON Components
        if hasattr(self, 'protected_json'):
            for i, original in enumerate(self.protected_json):
                content = content.replace(f"JSONP{i}PROTECT", original)

        # Clean Pandoc artifacts (escaped passthrough markers)
        content = content.replace("++_++", "_").replace("++{++", "{").replace("++}++", "}")
        content = content.replace("++{{++", "{{").replace("++}}++", "}}")
        content = content.replace("++<++", "<").replace("++>++", ">")
        content = content.replace("++*++", "*").replace("++_++", "_")

        # Restore video embeds
        content = re.sub(r'VIDEOTOKEN([a-zA-Z0-9_-]+)', r'video::\1[youtube]', content)
        # Normalize smart quotes / ellipsis to plain ASCII equivalents.
        content = content.replace('’', "'").replace('‘', "'").replace('“', '"').replace('”', '"').replace('…', '...')

        # Apply cleanup regex from config
        cleanup = self.conv_cfg.get("cleanup_regex", [])
        for c in cleanup:
            flags = re.M if c.get("flags") == "M" else 0
            regex = c.get("regex")
            replacement = c.get("replacement")

            if c.get("hook") == "uppercase_label":
                def uppercase_hook(m: Match) -> str:
                    return f"[{m.group(1).upper()}]\n====\n{m.group(2).strip()}\n===="
                content = re.sub(regex, uppercase_hook, content, flags=flags)
            else:
                content = re.sub(regex, replacement, content, flags=flags)

        # --- 5. ADMONITION PROMOTION ---
        def promote_admo(m: Match) -> str:
            return f"[{m.group(1).upper()}]\n====\n{m.group(2).strip()}\n===="

        content = re.sub(r'(?i)^\*?(Note|Warning|Tip|Caution|Important|IMPORTANT)[:]?\*?[:]?\s+(.*)$', promote_admo, content, flags=re.M)

        # --- 6. DYNAMIC RESTORATIONS ---
        restorations = self.conv_cfg.get("restoration_patterns", [])
        for r in restorations:
            regex, replacement = r.get("regex"), r.get("replacement")
            if r.get("hook") == "restore_spaces":
                def restore_hook(m: Match) -> str:
                    full_block = m.group(1)
                    parts = full_block.split("SHIELDSEP", 1) if "SHIELDSEP" in full_block else full_block.split("\n", 1)
                    title = parts[0].replace('PROTECTSPACE', ' ').strip()
                    body = parts[1].strip() if len(parts) > 1 else ""
                    return f".{title}\n[%collapsible]\n======\n{body}\n======"
                content = re.sub(regex, restore_hook, content, flags=re.S)
            else:
                mapping = r.get("map")
                if mapping:
                    for key, val in mapping.items():
                        content = re.sub(regex.replace("{key}", key), replacement.replace("{val}", val), content, flags=re.S)
                else:
                    content = re.sub(regex, replacement, content, flags=re.S)

        # --- 7. ANTORA NORMALIZATION (Xrefs & Images) ---
        # Image path cleanup: Strip /images/ and ensure double colon '::' for block images
        content = re.sub(r'image:/?images/(.*?)\[', r'image::\1[', content)

        # Ensure scaling is applied even if the path was already clean
        content = re.sub(r'image::([^\[]+)\[\]', r'image::\1[pdfwidth=100%,scalewidth=100%]', content)

        # Antora Xref Normalization: Convert Pandoc's link/xref syntax into clean Antora xrefs.
        def antora_xref_logic(m: Match) -> str:
            raw_path = m.group(1) or ""
            anchor = m.group(2) or ""
            path = raw_path.replace(".md", "").replace(".adoc", "").replace("./", "").strip("/")
            if "inputs/" in path:
                path = path.split("inputs/")[-1]
            if path:
                path = f"{path}.adoc"
            cl_anchor = ""
            if anchor:
                cl_anchor = "#" + anchor.replace("#", "").lower()
            return f"xref:{path}{cl_anchor}"

        content = re.sub(r'(?:link:|xref:)(?:\+\+)?([^\[\s#\+]+)?(#[^\[\s\+]+)?(?:\+\+)?', antora_xref_logic, content)

        # --- 8. FINAL CLEANUP ---
        # Ensure a blank line before lists (both * and .) to prevent squashing
        # Matches a non-newline character followed by a single newline and a list marker
        # content = re.sub(r'([^\n])\n([*.])\s', r'\1\n\n\2 ', content)

        content = re.sub(r'\[source,mermaid\]\n----(.*?)----', r'[mermaid]\n....\1....', content, flags=re.DOTALL)
        content = content.replace("SHIELDADMONSTARTtabs", "[tabs]\n====")
        content = content.replace("SHIELDADMONEND", "====")
        content = re.sub(r'^@tab\s+(.*)$', r'\1::', content, flags=re.M)

        return header_block + content.strip()

    def convert_file(self, input_path: Path, output_path: Path) -> None:
        """
        Orchestrates the conversion of a single Markdown file to AsciiDoc.

        Args:
            input_path (Path): Source Markdown file.
            output_path (Path): Destination for the raw AsciiDoc.

        Returns:
            None: Writes the converted content to output_path.
        """
        self.metadata = {}
        self.discovered_title = None

        raw_md = input_path.read_text(encoding='utf-8')
        ready_md = self.pre_process_markdown(raw_md)

        # We write to a temporary file so Pandoc sees the 'shielded' version
        temp_md = input_path.with_suffix('.tmp.md')
        temp_md.write_text(ready_md, encoding='utf-8')

        try:
            # Execute Pandoc CLI
            subprocess.run(
                [
                    "pandoc",
                    "-f", "markdown-smart",
                    "-t", "asciidoc",
                    "--shift-heading-level-by=-1",
                    "--wrap=none",
                    "-o", str(output_path),
                    str(temp_md)
                ],
                check=True,
                capture_output=True
            )

            # Post-process the Pandoc result to restore shields and finalize headers
            final_adoc = self.post_process_asciidoc(output_path.read_text(encoding='utf-8'))
            output_path.write_text(final_adoc, encoding='utf-8')
        finally:
            # Tidy up transient files
            if temp_md.exists():
                temp_md.unlink()
A pattern-driven engine that transforms Markdown into Enterprise AsciiDoc.
Attributes:
- config (Dict): Configuration extracted from pyproject.toml.
- metadata (Dict): Extracted frontmatter (YAML) from the source file.
- discovered_title (str): The inferred document title (H1 or YAML).
    def __init__(self, config_path: Optional[Path] = None):
        """Initializes the converter and loads conversion patterns.

        Args:
            config_path (Path, optional): Path to pyproject.toml.
                Defaults to "pyproject.toml" in the working directory.
        """
        self.config_path = config_path or Path("pyproject.toml")
        self.config = self._load_project_config()
        self.conv_cfg = self.config.get("conversions", {})
        self.metadata: Dict[str, Any] = {}
        self.discovered_title = None
        self.used_ids: Set[str] = set()
        self.protected_json: List[str] = []
        # --- SUSE Branding Attribute Map ---
        # Format: "Raw Text to Find": "{attribute-variable-name}"
        # ORDER MATTERS: Longest strings first to prevent partial matching.
        self.attribute_map = {
            "SUSE® Rancher Prime: Admission Policy Manager": "{kubewarden-product-name}",
            "SUSE® Rancher Prime: Continuous Delivery": "{fleet-product-name}",
            "SUSE® Rancher Prime: OS Manager": "{elemental-product-name}",
            "SUSE® Rancher Prime: Cluster API": "{turtles-product-name}",
            "SUSE® Rancher Prime: K3s": "{k3s-product-name}",
            "SUSE® Rancher Prime: RKE2": "{rke2-product-name}",
            "SUSE® Rancher Prime": "{rancher-product-name-tm}",
            "SUSE Rancher Prime": "{rancher-product-name}",
            "SUSE® Virtualization": "{harvester-product-name-tm}",
            "SUSE Virtualization": "{harvester-product-name}",
            "SUSE® Storage": "{longhorn-product-name-tm}",
            "SUSE Storage": "{longhorn-product-name}",
            "SUSE® Security": "{neuvector-product-name}",
            "SUSE® Losant": "{losant-product-name}",
            "SUSE Losant": "{losant-product-name}"
        }
Initializes the converter and loads conversion patterns.
    def pre_process_markdown(self, content: str) -> str:
        """
        Prepares Markdown for Pandoc by shielding modern syntax and extracting metadata.

        Resets per-document state (metadata, title, used IDs, protected JSON),
        then applies the shielding passes in a fixed order; the matching
        restore passes run later in post_process_asciidoc.

        Args:
            content (str): Raw Markdown string.

        Returns:
            str: "Shielded" Markdown ready for Pandoc.
        """
        self.metadata = {}
        self.discovered_title = None
        self.used_ids = set()
        # Initialize storage for JSON components to protect them from Pandoc
        self.protected_json = []

        # --- 1. CODE BLOCK SHIELDING ---
        # We protect '#' characters inside code blocks so the Title Scavenger
        # doesn't accidentally treat a code comment as the document's H1 title.
        content = re.sub(r'(`{3}.*?`{3})', lambda m: m.group(1).replace('#', 'HASHSHIELD'), content, flags=re.DOTALL)

        # --- 2. FRONTMATTER EXTRACTION ---
        # Extracts YAML metadata (title, description, etc.) from the top of the MD file.
        frontmatter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if frontmatter_match:
            try:
                yaml_data = yaml.safe_load(frontmatter_match.group(1))
                if isinstance(yaml_data, dict):
                    self.metadata = yaml_data
                # Strip the frontmatter block from the body in either case.
                content = content[frontmatter_match.end():]
            except Exception:
                self.metadata = {}

        # --- 3. TITLE SCAVENGER ---
        # Logic: Priority 1 is YAML 'title'. Priority 2 is the first H1 (#) found.
        self.discovered_title = self.metadata.get('title')
        if not self.discovered_title:
            h1_match = re.search(r'^#\s+(.*)$', content, re.M)
            if h1_match:
                self.discovered_title = h1_match.group(1).strip()
                # Remove the H1 from body as it will be promoted to the
                # AsciiDoc Document Title (=).
                content = content.replace(h1_match.group(0), "", 1)

        # Restore shielded hashes after title scavenging is safe.
        content = content.replace('HASHSHIELD', '#')

        # --- 4. VIDEO & COMPLEX PATTERN SHIELDING ---
        # Replaces complex HTML/Markdown blocks with tokens that Pandoc will ignore.
        # This protects <iframe> embeds and custom ':::tabs' blocks.
        content = re.sub(r'<iframe.*?embed/([^"?\s]+).*?</iframe>', r'VIDEOTOKEN\1', content)

        patterns = self.conv_cfg.get("shielding_patterns", [])
        for p in patterns:
            regex = p.get("regex")
            replacement = p.get("replacement")

            if p.get("hook") == "protect_spaces":
                # Special hook for collapsibles to ensure spaces in titles aren't lost.
                def protect_hook(match: Match) -> str:
                    title = match.group(1).strip().replace(' ', 'PROTECTSPACE')
                    body = match.group(2).strip()
                    return replacement.replace(r"\1", title).replace(r"\2", body)
                content = re.sub(regex, protect_hook, content, flags=re.S)
            else:
                content = re.sub(regex, replacement, content, flags=re.S)

        # --- JSON COMPONENT SHIELDING ---
        # Protects <JsonDisplay /> from being mangled into latexmath/footnotes by Pandoc
        def shield_json_display(match):
            # Pure alphanumeric placeholder to avoid Pandoc escaping
            placeholder = f"JSONP{len(self.protected_json)}PROTECT"
            self.protected_json.append(match.group(1))
            return placeholder

        content = re.sub(r'(<JsonDisplay.*?\/>)', shield_json_display, content, flags=re.DOTALL)

        # Protect existing Markdown IDs so Pandoc doesn't mangle curly braces
        content = re.sub(r'\{#(.*?)\}', r'IDSHIELDSTART\1IDSHIELDEND', content)

        return content
Prepares Markdown for Pandoc by shielding modern syntax and extracting metadata.
Arguments:
- content (str): Raw Markdown string.
Returns:
str: "Shielded" Markdown ready for Pandoc.
    def post_process_asciidoc(self, content: str) -> str:
        """
        Finalizes the AsciiDoc output after Pandoc has finished.

        Order of Operations:
        1. Reset ID tracker and prioritize H1 Document Title.
        2. Process H2-H6 headings with collision avoidance.
        3. Apply Global Branding Attributes to body prose.
        4. Construct the Metadata Header.
        5. Restore shielded blocks and clean Pandoc noise.
        6. Apply Antora-specific normalization (Xrefs & Image Scaling).

        Args:
            content (str): The raw AsciiDoc output from Pandoc.

        Returns:
            str: The finalized AsciiDoc content ready for Antora.
        """
        # --- 1. INITIALIZATION & H1 PRIORITY ---
        self.used_ids = set()
        today = datetime.now().strftime("%Y-%m-%d")

        # A. Create the ID Slug from the RAW title BEFORE branding
        # This prevents ID leakage (e.g., [#suse-storage] instead of [#{longhorn-product-name}])
        title_slug = self._slugify(self.discovered_title or "untitled")

        # B. Prepare the title for DISPLAY only
        title_text = self.discovered_title or "Untitled Document"
        title_text = self._apply_global_attributes(title_text)

        # --- 2. HEADING SLUGGING & NORMALIZATION ---
        def heading_anchor_logic(match):
            level_chars = match.group(1)
            raw_title = match.group(2).strip()

            # Check for shielded custom IDs from Markdown (e.g., {#my-custom-id})
            custom_id_match = re.search(r'IDSHIELDSTART(.*?)IDSHIELDEND', raw_title)

            if custom_id_match:
                base_id = custom_id_match.group(1)
                display_title = raw_title.replace(custom_id_match.group(0), "").strip()

                # Collision avoidance even for custom IDs
                final_id = base_id
                counter = 1
                while final_id in self.used_ids:
                    final_id = f"{base_id}-{counter}"
                    counter += 1
                self.used_ids.add(final_id)
                # Apply branding to display title after ID is locked
                display_title = self._apply_global_attributes(display_title)
            else:
                # Regular heading: slugify RAW title first for clean SEO
                final_id = self._slugify(raw_title)
                # Then brand the display title for the reader
                display_title = self._apply_global_attributes(raw_title)

            # We return the heading with an explicit anchor to ensure URL
            # stability, even if the title text changes in the future.
            return f"\n[#{final_id}]\n{level_chars} {display_title}"

        # Transform H2-H6 levels (Pandoc's == syntax)
        content = re.sub(r'\n(={2,6})\s+(.*)', heading_anchor_logic, content)

        # Heading cleanup
        content = re.sub(r'IDSHIELDSTART.*?IDSHIELDEND', '', content)
        content = re.sub(r'^={6,}\s+', r'===== ', content, flags=re.M)

        # C. Apply Global Attributes to the body content AFTER headings are locked
        content = self._apply_global_attributes(content)

        # --- 3. CONSTRUCT HEADER BLOCK ---
        header_lines = [
            f"[#{title_slug}]",
            f"= {title_text}",
            ":idprefix:",
            ":idseparator: -"
        ]

        # Inject YAML metadata
        for key, value in self.metadata.items():
            if key.lower() != "title":
                header_lines.append(f":{key}: {value}")

        header_lines.append(f":revdate: {today}")

        # Add global Antora headers from config
        antora_cfg = self.config.get("antora", {})
        header_lines.extend(antora_cfg.get("headers", []))
        header_block = "\n".join(header_lines) + "\n\n"

        # --- 4. MARKER RESTORATION & CLEANUP ---
        # Restore JSON Components
        if hasattr(self, 'protected_json'):
            for i, original in enumerate(self.protected_json):
                content = content.replace(f"JSONP{i}PROTECT", original)

        # Clean Pandoc artifacts (escaped passthrough markers)
        content = content.replace("++_++", "_").replace("++{++", "{").replace("++}++", "}")
        content = content.replace("++{{++", "{{").replace("++}}++", "}}")
        content = content.replace("++<++", "<").replace("++>++", ">")
        content = content.replace("++*++", "*").replace("++_++", "_")

        # Restore video embeds
        content = re.sub(r'VIDEOTOKEN([a-zA-Z0-9_-]+)', r'video::\1[youtube]', content)
        # Normalize smart quotes / ellipsis to plain ASCII equivalents.
        content = content.replace('’', "'").replace('‘', "'").replace('“', '"').replace('”', '"').replace('…', '...')

        # Apply cleanup regex from config
        cleanup = self.conv_cfg.get("cleanup_regex", [])
        for c in cleanup:
            flags = re.M if c.get("flags") == "M" else 0
            regex = c.get("regex")
            replacement = c.get("replacement")

            if c.get("hook") == "uppercase_label":
                def uppercase_hook(m: Match) -> str:
                    return f"[{m.group(1).upper()}]\n====\n{m.group(2).strip()}\n===="
                content = re.sub(regex, uppercase_hook, content, flags=flags)
            else:
                content = re.sub(regex, replacement, content, flags=flags)

        # --- 5. ADMONITION PROMOTION ---
        def promote_admo(m: Match) -> str:
            return f"[{m.group(1).upper()}]\n====\n{m.group(2).strip()}\n===="

        content = re.sub(r'(?i)^\*?(Note|Warning|Tip|Caution|Important|IMPORTANT)[:]?\*?[:]?\s+(.*)$', promote_admo, content, flags=re.M)

        # --- 6. DYNAMIC RESTORATIONS ---
        restorations = self.conv_cfg.get("restoration_patterns", [])
        for r in restorations:
            regex, replacement = r.get("regex"), r.get("replacement")
            if r.get("hook") == "restore_spaces":
                def restore_hook(m: Match) -> str:
                    full_block = m.group(1)
                    parts = full_block.split("SHIELDSEP", 1) if "SHIELDSEP" in full_block else full_block.split("\n", 1)
                    title = parts[0].replace('PROTECTSPACE', ' ').strip()
                    body = parts[1].strip() if len(parts) > 1 else ""
                    return f".{title}\n[%collapsible]\n======\n{body}\n======"
                content = re.sub(regex, restore_hook, content, flags=re.S)
            else:
                mapping = r.get("map")
                if mapping:
                    for key, val in mapping.items():
                        content = re.sub(regex.replace("{key}", key), replacement.replace("{val}", val), content, flags=re.S)
                else:
                    content = re.sub(regex, replacement, content, flags=re.S)

        # --- 7. ANTORA NORMALIZATION (Xrefs & Images) ---
        # Image path cleanup: Strip /images/ and ensure double colon '::' for block images
        content = re.sub(r'image:/?images/(.*?)\[', r'image::\1[', content)

        # Ensure scaling is applied even if the path was already clean
        content = re.sub(r'image::([^\[]+)\[\]', r'image::\1[pdfwidth=100%,scalewidth=100%]', content)

        # Antora Xref Normalization: Convert Pandoc's link/xref syntax into clean Antora xrefs.
        def antora_xref_logic(m: Match) -> str:
            raw_path = m.group(1) or ""
            anchor = m.group(2) or ""
            path = raw_path.replace(".md", "").replace(".adoc", "").replace("./", "").strip("/")
            if "inputs/" in path:
                path = path.split("inputs/")[-1]
            if path:
                path = f"{path}.adoc"
            cl_anchor = ""
            if anchor:
                cl_anchor = "#" + anchor.replace("#", "").lower()
            return f"xref:{path}{cl_anchor}"

        content = re.sub(r'(?:link:|xref:)(?:\+\+)?([^\[\s#\+]+)?(#[^\[\s\+]+)?(?:\+\+)?', antora_xref_logic, content)

        # --- 8. FINAL CLEANUP ---
        # Ensure a blank line before lists (both * and .) to prevent squashing
        # Matches a non-newline character followed by a single newline and a list marker
        # content = re.sub(r'([^\n])\n([*.])\s', r'\1\n\n\2 ', content)

        content = re.sub(r'\[source,mermaid\]\n----(.*?)----', r'[mermaid]\n....\1....', content, flags=re.DOTALL)
        content = content.replace("SHIELDADMONSTARTtabs", "[tabs]\n====")
        content = content.replace("SHIELDADMONEND", "====")
        content = re.sub(r'^@tab\s+(.*)$', r'\1::', content, flags=re.M)

        return header_block + content.strip()
Finalizes the AsciiDoc output after Pandoc has finished.
Order of Operations:
- Reset ID tracker and prioritize H1 Document Title.
- Process H2-H6 headings with collision avoidance.
- Apply Global Branding Attributes to body prose.
- Construct the Metadata Header.
- Restore shielded blocks and clean Pandoc noise.
- Apply Antora-specific normalization (Xrefs & Image Scaling).
Arguments:
- content (str): The raw AsciiDoc output from Pandoc.
Returns:
str: The finalized AsciiDoc content ready for Antora.
397 def convert_file(self, input_path: Path, output_path: Path) -> None: 398 """ 399 Orchestrates the conversion of a single Markdown file to AsciiDoc. 400 401 Args: 402 input_path (Path): Source Markdown file. 403 output_path (Path): Destination for the raw AsciiDoc. 404 405 Returns: 406 None: Writes the converted content to output_path. 407 """ 408 self.metadata = {} 409 self.discovered_title = None 410 411 raw_md = input_path.read_text(encoding='utf-8') 412 ready_md = self.pre_process_markdown(raw_md) 413 414 # We write to a temporary file so Pandoc sees the 'shielded' version 415 temp_md = input_path.with_suffix('.tmp.md') 416 temp_md.write_text(ready_md, encoding='utf-8') 417 418 try: 419 # Execute Pandoc CLI 420 subprocess.run( 421 [ 422 "pandoc", 423 "-f", "markdown-smart", 424 "-t", "asciidoc", 425 "--shift-heading-level-by=-1", 426 "--wrap=none", 427 "-o", str(output_path), 428 str(temp_md) 429 ], 430 check=True, 431 capture_output=True 432 ) 433 434 # Post-process the Pandoc result to restore shields and finalize headers 435 final_adoc = self.post_process_asciidoc(output_path.read_text(encoding='utf-8')) 436 output_path.write_text(final_adoc, encoding='utf-8') 437 finally: 438 # Tidy up transient files 439 if temp_md.exists(): 440 temp_md.unlink()
Orchestrates the conversion of a single Markdown file to AsciiDoc.
Arguments:
- input_path (Path): Source Markdown file.
- output_path (Path): Destination for the raw AsciiDoc.
Returns:
None: Writes the converted content to output_path.
31class StyleLinter: 32 """ 33 Orchestrates linguistic and style validation using Vale. 34 35 Attributes: 36 target_path (Path): The specific file (adoc/md) to be scanned. 37 config_path (Path): Path to the project's pyproject.toml. 38 vale_ini (Path): The path where the temporary .vale.ini will be created. 39 config (Dict): Loaded configuration specific to the transpiler-pro tool. 40 """ 41 42 def __init__(self, target_path: Path, config_path: Optional[Path] = None): 43 """ 44 Initializes the linter and prepares the configuration environment. 45 46 Args: 47 target_path (Path): File to be validated. 48 config_path (Path, optional): Path to pyproject.toml. Defaults to root. 49 """ 50 self.target_path = target_path 51 self.config_path = config_path or Path("pyproject.toml") 52 53 # We generate the .vale.ini in the same directory as the config for context isolation. 54 self.vale_ini: Path = self.config_path.parent / ".vale.ini" 55 56 self.config = self._load_project_config() 57 58 def _load_project_config(self) -> Dict[str, Any]: 59 """Loads linter-specific settings from the [tool.transpiler-pro] section.""" 60 if not self.config_path.exists(): 61 return {} 62 try: 63 with open(self.config_path, "rb") as f: 64 return tomllib.load(f).get("tool", {}).get("transpiler-pro", {}) 65 except Exception as e: 66 console.print(f"[bold red]Error loading linter config:[/] {e}") 67 return {} 68 69 def setup_config(self) -> None: 70 """ 71 Generates a temporary `.vale.ini` file required by the Vale CLI. 72 73 This method performs two key tasks: 74 1. Dynamic Vocab: Reads 'technical_terms' from the Knowledge Base and 75 writes them to a Vale 'accept.txt' file so they are ignored by 76 spelling checks. 77 2. Config Generation: Injects style paths, alert levels, and rule-sets 78 defined in pyproject.toml into the INI format. 
79 """ 80 linter_cfg = self.config.get("linter", {}) 81 # Ensure paths use forward slashes for cross-platform compatibility in Vale 82 styles_root = str(STYLES_DIR.resolve()).replace("\\", "/") 83 84 # --- PHASE 1: DYNAMIC VOCABULARY INJECTION --- 85 kb_setting = self.config.get("pipeline", {}).get("knowledge_base", "data/knowledge_base.json") 86 kb_path = Path(kb_setting) 87 vocab_setting = "" 88 89 if kb_path.exists(): 90 try: 91 kb_data = json.loads(kb_path.read_text(encoding="utf-8")) 92 tech_terms = kb_data.get("technical_terms", []) 93 94 if tech_terms: 95 # Vale expects a specific folder structure for Vocabularies 96 vocab_dir = STYLES_DIR / "vocabularies" / "Project" 97 vocab_dir.mkdir(parents=True, exist_ok=True) 98 accept_file = vocab_dir / "accept.txt" 99 100 # Store terms in the accepted list 101 accept_file.write_text("\n".join(tech_terms), encoding="utf-8") 102 vocab_setting = "Vocab = Project" 103 except Exception as e: 104 console.print(f"[yellow]⚠️ Warning:[/] Vocabulary injection failed: {e}") 105 106 # --- PHASE 2: INI CONSTRUCTION --- 107 styles = linter_cfg.get("styles", ["Vale", "common", "asciidoc"]) 108 styles_str = ", ".join(styles) 109 min_level = linter_cfg.get("min_alert_level", "suggestion") 110 111 # Construct the Vale configuration string 112 config_raw = f""" 113 StylesPath = {styles_root} 114 MinAlertLevel = {min_level} 115 {vocab_setting} 116 117 [*.{{adoc,md}}] 118 BasedOnStyles = {styles_str} 119 120 # Use the Asciidoctor parser for accurate block identification 121 asciidoctor = true 122 """ 123 124 self.vale_ini.write_text(textwrap.dedent(config_raw).strip()) 125 126 def _extract_suggestion(self, issue: Dict[str, Any]) -> str: 127 """ 128 Extracts a viable repair suggestion from a Vale violation. 129 130 Vale reports often include 'Action' parameters (e.g., the correct 131 spelling). If those aren't available, this method uses regex patterns 132 from pyproject.toml to "scrape" the suggestion out of the error message. 
133 """ 134 action_params = issue.get("Action", {}).get("Params", []) 135 patterns_cfg = self.config.get("patterns", {}) 136 ignored = patterns_cfg.get("ignored_placeholders", []) 137 138 # Priority 1: Check Vale's native suggestion parameters 139 if action_params: 140 candidate = str(action_params[0]) 141 if candidate not in ignored: 142 return candidate 143 144 # Priority 2: Scrape suggestions from the Message text using Regex 145 # e.g., Message: "Use 'SUSE' instead of 'suse'" -> Extracts 'SUSE' 146 search_pool = issue.get("Description", "") + " " + issue.get("Message", "") 147 pattern = patterns_cfg.get("suggestion_extraction", r"['\"‘“’](.*?)['\"’]") 148 149 if pattern and search_pool.strip(): 150 match = re.search(pattern, search_pool) 151 if match: 152 return match.group(1) 153 154 return "" 155 156 def run(self) -> Dict[str, List[Dict[str, Any]]]: 157 """ 158 Executes the Vale CLI and returns a structured map of findings. 159 160 Returns: 161 Dict: Key is file path, Value is a list of violation dictionaries 162 containing Line, Check ID, Severity, and Suggestion. 
163 """ 164 try: 165 abs_target = str(self.target_path.resolve()) 166 167 # Execute Vale in JSON mode for programmatic parsing 168 result = subprocess.run( 169 ["vale", "--config", str(self.vale_ini.resolve()), "--output=JSON", abs_target], 170 capture_output=True, 171 text=True, 172 check=False 173 ) 174 175 if not result.stdout or result.stdout.strip() == "": 176 return {} 177 178 raw_data = json.loads(result.stdout) 179 processed_findings = {} 180 181 # Convert raw Vale schema to Transpiler-Pro's internal repair schema 182 for file_path, file_issues in raw_data.items(): 183 processed_findings[file_path] = [] 184 for issue in file_issues: 185 processed_findings[file_path].append({ 186 "Line": issue.get("Line"), 187 "Check": issue.get("Check"), 188 "Severity": issue.get("Severity"), 189 "Message": issue.get("Message"), 190 "Description": issue.get("Description", ""), 191 "Suggestion": self._extract_suggestion(issue) 192 }) 193 194 return processed_findings 195 196 except (FileNotFoundError, json.JSONDecodeError, subprocess.SubprocessError) as e: 197 console.print(f"[bold red]Linter Execution Error:[/] {e}") 198 return {} 199 200 def display_report(self, data: Dict[str, List[Dict[str, Any]]]) -> None: 201 """ 202 Renders a user-friendly report of the findings. 203 The actual visual table is commented out to allow CLI orchestration 204 to handle final output density, but the logic remains for debugging. 205 """ 206 if not data or not any(data.values()): 207 console.print("\n✨ [bold green]Quality Check Passed: Document meets all style guide requirements.[/]") 208 return 209 210 # Table rendering code... 
211 # Theme-based coloring for different alert levels 212 # linter_cfg = self.config.get("linter", {}) 213 # theme = linter_cfg.get("theme", {"error": "red", "warning": "yellow", "suggestion": "blue"}) 214 # table = Table(title="Style Guide Validation Report", title_style="bold cyan") 215 # table.add_column("Line", style="magenta", justify="right") 216 # table.add_column("Severity", style="bold") 217 # table.add_column("Message", style="white") 218 # table.add_column("Rule ID", style="yellow") 219 220 # for _, issues in data.items(): 221 # for issue in issues: 222 # sev = issue['Severity'] 223 # color = theme.get(sev.lower(), "white") 224 225 # table.add_row( 226 # str(issue['Line']), 227 # f"[{color}]{sev}[/]", 228 # issue['Message'], 229 # issue['Check'] 230 # ) 231 232 # console.print(table)
Orchestrates linguistic and style validation using Vale.
Attributes:
- target_path (Path): The specific file (adoc/md) to be scanned.
- config_path (Path): Path to the project's pyproject.toml.
- vale_ini (Path): The path where the temporary .vale.ini will be created.
- config (Dict): Loaded configuration specific to the transpiler-pro tool.
def __init__(self, target_path: Path, config_path: Optional[Path] = None):
    """
    Set up the linter and prepare its configuration environment.

    Args:
        target_path (Path): File to be validated.
        config_path (Path, optional): Path to pyproject.toml. Defaults to root.
    """
    self.target_path = target_path
    self.config_path = config_path or Path("pyproject.toml")

    # The temporary .vale.ini lives next to the config file so Vale runs
    # are isolated to this project's context.
    self.vale_ini: Path = self.config_path.parent / ".vale.ini"

    # Pull the [tool.transpiler-pro] settings up front.
    self.config = self._load_project_config()
Initializes the linter and prepares the configuration environment.
Arguments:
- target_path (Path): File to be validated.
- config_path (Path, optional): Path to pyproject.toml. Defaults to root.
def setup_config(self) -> None:
    """
    Generates a temporary `.vale.ini` file required by the Vale CLI.

    This method performs two key tasks:

    1. Dynamic Vocab: Reads 'technical_terms' from the Knowledge Base and
       writes them to a Vale 'accept.txt' file so they are ignored by
       spelling checks.
    2. Config Generation: Injects style paths, alert levels, and rule-sets
       defined in pyproject.toml into the INI format.
    """
    linter_cfg = self.config.get("linter", {})
    # Ensure paths use forward slashes for cross-platform compatibility in Vale
    styles_root = str(STYLES_DIR.resolve()).replace("\\", "/")

    # --- PHASE 1: DYNAMIC VOCABULARY INJECTION ---
    kb_setting = self.config.get("pipeline", {}).get("knowledge_base", "data/knowledge_base.json")
    kb_path = Path(kb_setting)
    vocab_setting = ""

    if kb_path.exists():
        try:
            kb_data = json.loads(kb_path.read_text(encoding="utf-8"))
            tech_terms = kb_data.get("technical_terms", [])

            if tech_terms:
                # Vale expects a specific folder structure for Vocabularies
                vocab_dir = STYLES_DIR / "vocabularies" / "Project"
                vocab_dir.mkdir(parents=True, exist_ok=True)
                accept_file = vocab_dir / "accept.txt"

                # Store terms in the accepted list
                accept_file.write_text("\n".join(tech_terms), encoding="utf-8")
                vocab_setting = "Vocab = Project"
        except Exception as e:
            # Best-effort: a broken KB must not block linting.
            console.print(f"[yellow]⚠️ Warning:[/] Vocabulary injection failed: {e}")

    # --- PHASE 2: INI CONSTRUCTION ---
    styles = linter_cfg.get("styles", ["Vale", "common", "asciidoc"])
    styles_str = ", ".join(styles)
    min_level = linter_cfg.get("min_alert_level", "suggestion")

    # Construct the Vale configuration string
    config_raw = f"""
    StylesPath = {styles_root}
    MinAlertLevel = {min_level}
    {vocab_setting}

    [*.{{adoc,md}}]
    BasedOnStyles = {styles_str}

    # Use the Asciidoctor parser for accurate block identification
    asciidoctor = true
    """

    # FIX: write the INI as UTF-8 explicitly. Without an encoding argument,
    # write_text falls back to the platform locale encoding, which can
    # corrupt non-ASCII style paths; every other write here uses utf-8.
    self.vale_ini.write_text(textwrap.dedent(config_raw).strip(), encoding="utf-8")
Generates a temporary .vale.ini file required by the Vale CLI.
This method performs two key tasks:
- Dynamic Vocab: Reads 'technical_terms' from the Knowledge Base and writes them to a Vale 'accept.txt' file so they are ignored by spelling checks.
- Config Generation: Injects style paths, alert levels, and rule-sets defined in pyproject.toml into the INI format.
def run(self) -> Dict[str, List[Dict[str, Any]]]:
    """
    Executes the Vale CLI and returns a structured map of findings.

    Returns:
        Dict: Key is file path, Value is a list of violation dictionaries
            containing Line, Check ID, Severity, and Suggestion. Empty dict
            when Vale produced no output or execution failed.
    """
    try:
        abs_target = str(self.target_path.resolve())

        # Execute Vale in JSON mode for programmatic parsing.
        # check=False: non-zero exit codes are handled by inspecting
        # stdout below instead of raising.
        result = subprocess.run(
            ["vale", "--config", str(self.vale_ini.resolve()), "--output=JSON", abs_target],
            capture_output=True,
            text=True,
            check=False
        )

        # Empty stdout is treated as "no findings".
        if not result.stdout or result.stdout.strip() == "":
            return {}

        raw_data = json.loads(result.stdout)
        processed_findings = {}

        # Convert raw Vale schema to Transpiler-Pro's internal repair schema
        for file_path, file_issues in raw_data.items():
            processed_findings[file_path] = []
            for issue in file_issues:
                processed_findings[file_path].append({
                    "Line": issue.get("Line"),
                    "Check": issue.get("Check"),
                    "Severity": issue.get("Severity"),
                    "Message": issue.get("Message"),
                    "Description": issue.get("Description", ""),
                    # Derive a concrete replacement for the fixer to apply.
                    "Suggestion": self._extract_suggestion(issue)
                })

        return processed_findings

    except (FileNotFoundError, json.JSONDecodeError, subprocess.SubprocessError) as e:
        # Missing vale binary, malformed JSON, or subprocess failure:
        # report the error and degrade to "no findings".
        console.print(f"[bold red]Linter Execution Error:[/] {e}")
        return {}
Executes the Vale CLI and returns a structured map of findings.
Returns:
Dict: Key is file path, Value is a list of violation dictionaries containing Line, Check ID, Severity, and Suggestion.
def display_report(self, data: Dict[str, List[Dict[str, Any]]]) -> None:
    """
    Render a user-friendly report of the findings.

    The visual table is intentionally commented out so CLI orchestration
    controls final output density; the logic is kept for debugging.
    """
    has_findings = bool(data) and any(data.values())
    if not has_findings:
        console.print("\n✨ [bold green]Quality Check Passed: Document meets all style guide requirements.[/]")
        return

    # Table rendering code, kept for debugging:
    # linter_cfg = self.config.get("linter", {})
    # theme = linter_cfg.get("theme", {"error": "red", "warning": "yellow", "suggestion": "blue"})
    # table = Table(title="Style Guide Validation Report", title_style="bold cyan")
    # table.add_column("Line", style="magenta", justify="right")
    # table.add_column("Severity", style="bold")
    # table.add_column("Message", style="white")
    # table.add_column("Rule ID", style="yellow")
    # for _, issues in data.items():
    #     for issue in issues:
    #         sev = issue['Severity']
    #         color = theme.get(sev.lower(), "white")
    #         table.add_row(str(issue['Line']), f"[{color}]{sev}[/]", issue['Message'], issue['Check'])
    # console.print(table)
Renders a user-friendly report of the findings. The actual visual table is commented out to allow CLI orchestration to handle final output density, but the logic remains for debugging.
class StyleFixer:
    """
    NLP-enhanced repair engine that learns and persists style corrections.

    Attributes:
        config (Dict): Tool configuration extracted from pyproject.toml.
        kb_path (Path): Location of the persistent JSON knowledge base.
        kb (Dict): The internal memory of the fixer (Branding + Learned terms).
        nlp: The spaCy language model used for linguistic context checks,
            or None when the model could not be loaded.
    """

    def __init__(self, config_path: Optional[Path] = None) -> None:
        """
        Initializes the fixer and loads the persistent Knowledge Base.

        Args:
            config_path (Path, optional): Custom path to pyproject.toml.
        """
        self.config_path = config_path or Path("pyproject.toml")
        self.config = self._load_config()

        # Load the Knowledge Base (JSON) which stores branding and learned words.
        kb_setting = self.config.get("pipeline", {}).get("knowledge_base", "data/knowledge_base.json")
        self.kb_path = Path(kb_setting)
        self.kb = self._load_kb()

        try:
            self.nlp = spacy.load("en_core_web_sm")
        except Exception:
            # Fallback if spaCy is missing; some tense-shifting features may be limited.
            self.nlp = None

    def _load_config(self) -> Dict[str, Any]:
        """Reads the [tool.transpiler-pro] section from the project TOML.

        Returns an empty dict on a missing or unreadable file so callers
        always receive a dict.
        """
        if not self.config_path.exists():
            return {}
        try:
            with open(self.config_path, "rb") as f:
                return tomllib.load(f).get("tool", {}).get("transpiler-pro", {})
        except (tomllib.TOMLDecodeError, OSError):
            return {}

    def _load_kb(self) -> Dict[str, Any]:
        """Loads the JSON brain. Initializes empty branding/learned dicts if missing.

        NOTE(review): a KB file that parses but lacks the 'learned' key would
        later cause a KeyError in fix_file's auto-learning step — confirm the
        KB schema always contains 'branding' and 'learned'.
        """
        if self.kb_path.exists():
            try:
                return json.loads(self.kb_path.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, OSError):
                # Fall through to the empty default on a corrupt file.
                pass
        return {"branding": {}, "learned": {}}

    def _save_kb(self) -> None:
        """Persists learned corrections to disk for future pipeline runs."""
        try:
            self.kb_path.parent.mkdir(parents=True, exist_ok=True)
            self.kb_path.write_text(json.dumps(self.kb, indent=4), encoding="utf-8")
        except Exception as e:
            # Saving is best-effort; a failed persist must not abort the repair run.
            console.print(f"[red]Error saving Knowledge Base:[/] {e}")

    def _get_progressive_verb(self, verb_token) -> str:
        """
        Logic to convert a verb to its '-ing' form.

        Prioritizes the 'special_verbs' table in pyproject.toml to handle
        irregular conjugations (e.g., 'stop' -> 'stopping') before falling
        back to standard English suffix rules.

        Args:
            verb_token: A spaCy token; only its lemma_ attribute is read.

        Returns:
            str: The progressive ('-ing') form of the verb's lemma.
        """
        lemma = verb_token.lemma_.lower()
        grammar_cfg = self.config.get("grammar", {})
        special = grammar_cfg.get("special_verbs", {})

        if lemma in special:
            return special[lemma]

        # Standard -ing rules: drop trailing silent 'e' (make -> making),
        # but keep double-e endings (see -> seeing).
        if lemma.endswith("e") and not lemma.endswith("ee"):
            return lemma[:-1] + "ing"
        # CVC rule: Double the consonant (e.g., run -> running)
        if len(lemma) > 2 and lemma[-1] not in "aeiou" and lemma[-2] in "aeiou" and lemma[-3] not in "aeiou":
            return lemma + lemma[-1] + "ing"
        return lemma + "ing"

    def _fix_tense(self, line: str) -> str:
        """
        Standard Tense Shifter: "We will test" -> "We are testing".

        Note: This is a legacy/alternative shifter; primary tense shifting
        is now handled by the more advanced LinguisticEngine in repair.py.
        Returns the line unchanged when spaCy is unavailable.
        """
        if not self.nlp:
            return line
        doc = self.nlp(line)
        working_line = line
        for token in doc:
            if token.text.lower() == "will":
                main_verb = token.head
                if main_verb.pos_ == "VERB":
                    # Determine plurality for correct aux verb (is vs are)
                    subjects = [w for w in main_verb.lefts if "subj" in w.dep_]
                    is_plural = any("Number=Plur" in str(s.morph) or s.text.lower() in ["we", "they", "you"] for s in subjects)
                    aux = "are" if is_plural else "is"
                    prog = self._get_progressive_verb(main_verb)
                    working_line = re.sub(rf"\b{token.text}\s+{main_verb.text}\b", f"{aux} {prog}", working_line, flags=re.IGNORECASE)
        return working_line

    def fix_file(self, file_path: Path, violations: List[Dict[str, Any]]) -> int:
        """
        The main repair loop. Iterates through line-specific violations and
        applies branding and style corrections.

        Args:
            file_path (Path): Path to the generated AsciiDoc file.
            violations (List[Dict]): List of findings from the Linter.

        Returns:
            int: Number of lines successfully modified.
        """
        if not file_path.exists():
            return 0
        content = file_path.read_text(encoding="utf-8").splitlines()
        total_fixes = 0

        # Group issues by line number for efficient processing
        line_map = defaultdict(list)
        for v in violations:
            line_map[v.get("Line", 0)].append(v)

        patterns = self.config.get("patterns", {})
        extract_re = patterns.get("suggestion_extraction", r"'(.*?)'")
        remove_trigger = patterns.get("removal_trigger", "removing")
        instead_of_trigger = patterns.get("instead_of_trigger", "instead of")

        # Current branding context (Permanent + Learned during this session)
        session_branding = {**self.kb.get("learned", {}), **self.kb.get("automated_fixes", {})}

        # Process lines in reverse order to ensure line-length changes don't shift indices
        for line_num in sorted(line_map.keys(), reverse=True):
            idx = line_num - 1
            # Skip out-of-range line numbers (e.g. the 0 bucket for missing "Line").
            if idx < 0 or idx >= len(content):
                continue

            working_line = content[idx]
            original_line = working_line

            # --- PHASE 1: LINTER-DRIVEN REPAIRS ---
            for issue in line_map[line_num]:
                msg = issue.get("Message", "")
                check_id = issue.get("Check", "")
                suggestion = issue.get("Suggestion", "")

                # 1. Branding Sync (e.g., Use 'SUSE' instead of 'suse')
                # NOTE(review): comparison is against msg.lower(), so a 'wrong'
                # key containing uppercase can never match — confirm KB keys
                # are stored lowercase.
                for wrong, correct in session_branding.items():
                    if f"'{wrong}'" in msg.lower() or f"‘{wrong}’" in msg.lower():
                        working_line = re.sub(rf"\b{re.escape(wrong)}\b", correct, working_line, flags=re.IGNORECASE)

                # 2. Surgical Removal (e.g., "Note that...", "Actually...")
                if remove_trigger in msg.lower() or "Editorializing" in check_id:
                    target = suggestion if suggestion else (re.findall(extract_re, msg)[0] if re.findall(extract_re, msg) else None)
                    if target:
                        # Also consume one trailing space so no double-gap remains.
                        working_line = re.sub(rf"\b{re.escape(target)}\b\s?", "", working_line, flags=re.IGNORECASE)

                # 3. Phrasal Substitution (e.g., "Use 'X' instead of 'Y'")
                elif instead_of_trigger in msg.lower():
                    if suggestion:
                        m = re.findall(extract_re, msg)
                        # Second quoted term is the wrong one; fall back to the first.
                        wrong_term = m[1] if len(m) >= 2 else (m[0] if m else "")
                        if wrong_term:
                            # --- GUARDRAIL: Let repair.py handle complex tense shifts ---
                            if "will" in wrong_term.lower() or "will" in msg.lower():
                                continue
                            working_line = re.sub(rf"\b{re.escape(wrong_term)}\b", suggestion, working_line, flags=re.IGNORECASE)

                # 4. Auto-Learning: Capture spelling fixes into the Knowledge Base
                elif "Spelling" in check_id:
                    if suggestion and suggestion.lower() not in ["spelling", "spellings", "learned"]:
                        match = re.findall(extract_re, msg)
                        word_to_fix = match[0] if match else ""
                        if word_to_fix:
                            working_line = re.sub(rf"\b{re.escape(word_to_fix)}\b", suggestion, working_line)
                            # Persist this correction for future automation
                            if word_to_fix.lower() not in session_branding:
                                self.kb["learned"][word_to_fix.lower()] = suggestion

            # --- PHASE 2: GLOBAL BRANDING & FORMATTING GUARDRAILS ---

            # 1. Branding Guardrail: Apply core branding safely (no URL/Path corruption)
            for wrong, correct in self.kb.get("automated_fixes", {}).items():
                # Negative lookarounds (?<![\/-]) prevent breaking paths like /img/suse-logo.svg
                pattern = rf"(?<![\/-])\b{re.escape(wrong)}\b(?![\/-])"
                working_line = re.sub(pattern, correct, working_line, flags=re.IGNORECASE)

            # 2. Fragment Healer: Ensure sentences start with capital letters
            # Ignores lines starting with AsciiDoc technical syntax
            if not re.match(r'^(image::|video::|xref:|link:|http|\[|:)', working_line, flags=re.IGNORECASE):
                working_line = re.sub(r'(^|\.\s+)([a-z])', lambda m: m.group(1) + m.group(2).upper(), working_line)

            # Update line in content if modifications were made
            if working_line != original_line:
                content[idx] = working_line
                total_fixes += 1

        # Write corrected content back and update the JSON brain
        file_path.write_text("\n".join(content), encoding="utf-8")
        self._save_kb()
        return total_fixes
NLP-enhanced repair engine that learns and persists style corrections.
Attributes:
- config (Dict): Tool configuration extracted from pyproject.toml.
- kb_path (Path): Location of the persistent JSON knowledge base.
- kb (Dict): The internal memory of the fixer (Branding + Learned terms).
- nlp: The spaCy language model used for linguistic context checks.
def __init__(self, config_path: Optional[Path] = None) -> None:
    """
    Build the fixer and load the persistent Knowledge Base.

    Args:
        config_path (Path, optional): Custom path to pyproject.toml.
    """
    self.config_path = config_path or Path("pyproject.toml")
    self.config = self._load_config()

    # The Knowledge Base (JSON) stores branding plus learned words.
    pipeline_cfg = self.config.get("pipeline", {})
    self.kb_path = Path(pipeline_cfg.get("knowledge_base", "data/knowledge_base.json"))
    self.kb = self._load_kb()

    # spaCy is optional; without it some tense-shifting features are limited.
    try:
        self.nlp = spacy.load("en_core_web_sm")
    except Exception:
        self.nlp = None
Initializes the fixer and loads the persistent Knowledge Base.
Arguments:
- config_path (Path, optional): Custom path to pyproject.toml.
def fix_file(self, file_path: Path, violations: List[Dict[str, Any]]) -> int:
    """
    The main repair loop. Iterates through line-specific violations and
    applies branding and style corrections.

    Args:
        file_path (Path): Path to the generated AsciiDoc file.
        violations (List[Dict]): List of findings from the Linter.

    Returns:
        int: Number of lines successfully modified.
    """
    if not file_path.exists():
        return 0
    content = file_path.read_text(encoding="utf-8").splitlines()
    total_fixes = 0

    # Group issues by line number for efficient processing
    line_map = defaultdict(list)
    for v in violations:
        line_map[v.get("Line", 0)].append(v)

    patterns = self.config.get("patterns", {})
    extract_re = patterns.get("suggestion_extraction", r"'(.*?)'")
    remove_trigger = patterns.get("removal_trigger", "removing")
    instead_of_trigger = patterns.get("instead_of_trigger", "instead of")

    # Current branding context (Permanent + Learned during this session)
    session_branding = {**self.kb.get("learned", {}), **self.kb.get("automated_fixes", {})}

    # Process lines in reverse order to ensure line-length changes don't shift indices
    for line_num in sorted(line_map.keys(), reverse=True):
        idx = line_num - 1
        # Skip out-of-range line numbers (e.g. the 0 bucket for missing "Line").
        if idx < 0 or idx >= len(content):
            continue

        working_line = content[idx]
        original_line = working_line

        # --- PHASE 1: LINTER-DRIVEN REPAIRS ---
        for issue in line_map[line_num]:
            msg = issue.get("Message", "")
            check_id = issue.get("Check", "")
            suggestion = issue.get("Suggestion", "")

            # 1. Branding Sync (e.g., Use 'SUSE' instead of 'suse')
            # NOTE(review): comparison is against msg.lower(), so a 'wrong'
            # key containing uppercase can never match — confirm KB keys
            # are stored lowercase.
            for wrong, correct in session_branding.items():
                if f"'{wrong}'" in msg.lower() or f"‘{wrong}’" in msg.lower():
                    working_line = re.sub(rf"\b{re.escape(wrong)}\b", correct, working_line, flags=re.IGNORECASE)

            # 2. Surgical Removal (e.g., "Note that...", "Actually...")
            if remove_trigger in msg.lower() or "Editorializing" in check_id:
                target = suggestion if suggestion else (re.findall(extract_re, msg)[0] if re.findall(extract_re, msg) else None)
                if target:
                    # Also consume one trailing space so no double-gap remains.
                    working_line = re.sub(rf"\b{re.escape(target)}\b\s?", "", working_line, flags=re.IGNORECASE)

            # 3. Phrasal Substitution (e.g., "Use 'X' instead of 'Y'")
            elif instead_of_trigger in msg.lower():
                if suggestion:
                    m = re.findall(extract_re, msg)
                    # Second quoted term is the wrong one; fall back to the first.
                    wrong_term = m[1] if len(m) >= 2 else (m[0] if m else "")
                    if wrong_term:
                        # --- GUARDRAIL: Let repair.py handle complex tense shifts ---
                        if "will" in wrong_term.lower() or "will" in msg.lower():
                            continue
                        working_line = re.sub(rf"\b{re.escape(wrong_term)}\b", suggestion, working_line, flags=re.IGNORECASE)

            # 4. Auto-Learning: Capture spelling fixes into the Knowledge Base
            elif "Spelling" in check_id:
                if suggestion and suggestion.lower() not in ["spelling", "spellings", "learned"]:
                    match = re.findall(extract_re, msg)
                    word_to_fix = match[0] if match else ""
                    if word_to_fix:
                        working_line = re.sub(rf"\b{re.escape(word_to_fix)}\b", suggestion, working_line)
                        # Persist this correction for future automation
                        if word_to_fix.lower() not in session_branding:
                            self.kb["learned"][word_to_fix.lower()] = suggestion

        # --- PHASE 2: GLOBAL BRANDING & FORMATTING GUARDRAILS ---

        # 1. Branding Guardrail: Apply core branding safely (no URL/Path corruption)
        for wrong, correct in self.kb.get("automated_fixes", {}).items():
            # Negative lookarounds (?<![\/-]) prevent breaking paths like /img/suse-logo.svg
            pattern = rf"(?<![\/-])\b{re.escape(wrong)}\b(?![\/-])"
            working_line = re.sub(pattern, correct, working_line, flags=re.IGNORECASE)

        # 2. Fragment Healer: Ensure sentences start with capital letters
        # Ignores lines starting with AsciiDoc technical syntax
        if not re.match(r'^(image::|video::|xref:|link:|http|\[|:)', working_line, flags=re.IGNORECASE):
            working_line = re.sub(r'(^|\.\s+)([a-z])', lambda m: m.group(1) + m.group(2).upper(), working_line)

        # Update line in content if modifications were made
        if working_line != original_line:
            content[idx] = working_line
            total_fixes += 1

    # Write corrected content back and update the JSON brain
    file_path.write_text("\n".join(content), encoding="utf-8")
    self._save_kb()
    return total_fixes
The main repair loop. Iterates through line-specific violations and applies branding and style corrections.
Arguments:
- file_path (Path): Path to the generated AsciiDoc file.
- violations (List[Dict]): List of findings from the Linter.
Returns:
int: Number of lines successfully modified.
def get_info() -> str:
    """
    Return the basic identity string for the package.

    Returns:
        str: A formatted string containing the tool name, version, and purpose.
    """
    version = __version__
    return "Transpiler-Pro v{} - Enterprise Documentation Engine".format(version)
Returns the basic identity string for the package.
Returns:
str: A formatted string containing the tool name, version, and purpose.