transpiler_pro

Location: src/transpiler_pro/__init__.py

Description: Transpiler-Pro - Enterprise Documentation Pipeline.

This package provides a comprehensive engine for converting Markdown to AsciiDoc while enforcing linguistic standards and branding consistency.

Core Features:

  1. DocConverter: Shield-Transpile-Restore logic using Pandoc.
  2. StyleLinter: Orchestration of Vale-based style validation.
  3. StyleFixer: NLP-driven 'Auto-Heal' for linguistic repair.

The pipeline is entirely data-driven, utilizing settings defined in pyproject.toml.

 1"""
 2Location: src/transpiler_pro/__init__.py
 3
 4Description: Transpiler-Pro - Enterprise Documentation Pipeline.
 5
 6This package provides a comprehensive engine for converting Markdown to 
 7AsciiDoc while enforcing linguistic standards and branding consistency.
 8
 9Core Features:
10
111. **DocConverter**: Shield-Transpile-Restore logic using Pandoc.
122. **StyleLinter**: Orchestration of Vale-based style validation.
133. **StyleFixer**: NLP-driven 'Auto-Heal' for linguistic repair.
14
15The pipeline is entirely data-driven, utilizing settings defined in `pyproject.toml`.
16"""
17
18from .core.converter import DocConverter
19from .core.linter import StyleLinter
20from .core.fixer import StyleFixer
21
22__version__ = "1.0.0"
23__author__ = "Sushant Gaurav"
24
25
26def get_info() -> str:
27    """
28    Returns the basic identity string for the package.
29
30    Returns:
31        str: A formatted string containing the tool name, version, and purpose.
32    """
33    return f"Transpiler-Pro v{__version__} - Enterprise Documentation Engine"
34
35
36# Defines the public API exposed at the top level of the package
37__all__ = [
38    "DocConverter", 
39    "StyleLinter", 
40    "StyleFixer", 
41    "__version__", 
42    "get_info"
43]
class DocConverter:
    """
    A pattern-driven engine that transforms Markdown into Enterprise AsciiDoc.

    Attributes:
        config (Dict): Configuration extracted from pyproject.toml.
        metadata (Dict): Extracted frontmatter (YAML) from the source file.
        discovered_title (Optional[str]): The inferred document title (H1 or YAML).
    """

    def __init__(self, config_path: Optional[Path] = None):
        """Initializes the converter and loads conversion patterns.

        Args:
            config_path (Optional[Path]): Path to pyproject.toml. Defaults to
                "pyproject.toml" in the current working directory.
        """
        self.config_path = config_path or Path("pyproject.toml")
        self.config = self._load_project_config()
        self.conv_cfg = self.config.get("conversions", {})
        self.metadata: Dict[str, Any] = {}
        self.discovered_title: Optional[str] = None
        self.used_ids: Set[str] = set()
        self.protected_json: List[str] = []
        # --- SUSE Branding Attribute Map ---
        # Format: "Raw Text to Find": "{attribute-variable-name}"
        # ORDER MATTERS: Longest strings first to prevent partial matching.
        self.attribute_map = {
            "SUSE® Rancher Prime: Admission Policy Manager": "{kubewarden-product-name}",
            "SUSE® Rancher Prime: Continuous Delivery": "{fleet-product-name}",
            "SUSE® Rancher Prime: OS Manager": "{elemental-product-name}",
            "SUSE® Rancher Prime: Cluster API": "{turtles-product-name}",
            "SUSE® Rancher Prime: K3s": "{k3s-product-name}",
            "SUSE® Rancher Prime: RKE2": "{rke2-product-name}",
            "SUSE® Rancher Prime": "{rancher-product-name-tm}",
            "SUSE Rancher Prime": "{rancher-product-name}",
            "SUSE® Virtualization": "{harvester-product-name-tm}",
            "SUSE Virtualization": "{harvester-product-name}",
            "SUSE® Storage": "{longhorn-product-name-tm}",
            "SUSE Storage": "{longhorn-product-name}",
            "SUSE® Security": "{neuvector-product-name}",
            "SUSE® Losant": "{losant-product-name}",
            "SUSE Losant": "{losant-product-name}"
        }

    def _load_project_config(self) -> Dict[str, Any]:
        """Loads the [tool.transpiler-pro] configuration block.

        Returns:
            Dict: The configuration dictionary for transpiler-pro, or an empty
            dict when self.config_path is missing or unreadable.
        """
        if not self.config_path.exists():
            return {}
        try:
            # Local import keeps the dependency lazy: only paid when a config exists.
            import tomllib
            with open(self.config_path, "rb") as f:
                return tomllib.load(f).get("tool", {}).get("transpiler-pro", {})
        except Exception:
            # Best-effort by design: a malformed pyproject.toml must not
            # abort the conversion pipeline.
            return {}

    def _apply_global_attributes(self, text: str) -> str:
        """
        Replaces raw product names with Antora attributes.
        Uses a negative lookbehind to protect URLs and file paths.

        Args:
            text (str): The input string to process.

        Returns:
            str: The text with product names replaced by attributes.
        """
        # Guard clause for empty text to avoid unnecessary processing.
        if not text:
            return text

        # Iterate the attribute map in declaration order (longest-first) so
        # longer product names win before their substrings are considered.
        for raw_name, attr in self.attribute_map.items():
            # Lookbehind (?<![/:]) protects URLs/paths; \b allows trailing
            # sentence punctuation after the product name.
            pattern = rf"(?<![/:])\b{re.escape(raw_name)}\b"
            text = re.sub(pattern, attr, text)

        return text

    def _slugify(self, text: str) -> str:
        """
        Converts a heading title into a SEO-friendly, unique ID.
        Example: "Access Keys & Security" -> "access-keys-security"

        Args:
            text (str): The raw heading text.

        Returns:
            str: A slugified version suitable for use as an anchor ID,
            guaranteed unique within self.used_ids.
        """
        # 1. Lowercase and strip technical syntax and HTML/JSX tags
        slug = text.lower()
        slug = re.sub(r'<[^>]+>', '', slug)      # Remove HTML tags
        slug = re.sub(r'\{#.*?\}', '', slug)     # Remove existing MD IDs
        slug = re.sub(r'[^a-z0-9\s-]', '', slug) # Remove special chars

        # 2. Replace spaces/multiple dashes/underscores with a single dash
        slug = re.sub(r'[\s_/-]+', '-', slug).strip('-')

        # 3. Handle uniqueness within the document (Collision Avoidance)
        base_slug = slug or "section"
        final_slug = base_slug
        counter = 1
        while final_slug in self.used_ids:
            final_slug = f"{base_slug}-{counter}"
            counter += 1

        self.used_ids.add(final_slug)
        return final_slug

    def pre_process_markdown(self, content: str) -> str:
        """
        Prepares Markdown for Pandoc by shielding modern syntax and extracting metadata.

        Args:
            content (str): Raw Markdown string.

        Returns:
            str: "Shielded" Markdown ready for Pandoc.
        """
        self.metadata = {}
        self.discovered_title = None
        self.used_ids = set()
        # Initialize storage for JSON components to protect them from Pandoc
        self.protected_json = []

        # --- 1. CODE BLOCK SHIELDING ---
        # We protect '#' characters inside code blocks so the Title Scavenger
        # doesn't accidentally treat a code comment as the document's H1 title.
        content = re.sub(r'(`{3}.*?`{3})', lambda m: m.group(1).replace('#', 'HASHSHIELD'), content, flags=re.DOTALL)

        # --- 2. FRONTMATTER EXTRACTION ---
        # Extracts YAML metadata (title, description, etc.) from the top of the MD file.
        frontmatter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
        if frontmatter_match:
            try:
                yaml_data = yaml.safe_load(frontmatter_match.group(1))
                if isinstance(yaml_data, dict):
                    self.metadata = yaml_data
                content = content[frontmatter_match.end():]
            except Exception:
                # Invalid YAML: proceed without metadata rather than failing.
                self.metadata = {}

        # --- 3. TITLE SCAVENGER ---
        # Logic: Priority 1 is YAML 'title'. Priority 2 is the first H1 (#) found.
        self.discovered_title = self.metadata.get('title')
        if not self.discovered_title:
            h1_match = re.search(r'^#\s+(.*)$', content, re.M)
            if h1_match:
                self.discovered_title = h1_match.group(1).strip()
                # Remove the H1 from body as it will be promoted to the AsciiDoc Document Title (=).
                content = content.replace(h1_match.group(0), "", 1)

        # Restore shielded hashes after title scavenging is safe.
        content = content.replace('HASHSHIELD', '#')

        # --- 4. VIDEO & COMPLEX PATTERN SHIELDING ---
        # Replaces complex HTML/Markdown blocks with tokens that Pandoc will ignore.
        # This protects <iframe> embeds and custom ':::tabs' blocks.
        content = re.sub(r'<iframe.*?embed/([^"?\s]+).*?</iframe>', r'VIDEOTOKEN\1', content)

        patterns = self.conv_cfg.get("shielding_patterns", [])
        for p in patterns:
            regex = p.get("regex")
            replacement = p.get("replacement")

            if p.get("hook") == "protect_spaces":
                # Special hook for collapsibles to ensure spaces in titles aren't lost.
                def protect_hook(match: Match) -> str:
                    title = match.group(1).strip().replace(' ', 'PROTECTSPACE')
                    body = match.group(2).strip()
                    return replacement.replace(r"\1", title).replace(r"\2", body)
                content = re.sub(regex, protect_hook, content, flags=re.S)
            else:
                content = re.sub(regex, replacement, content, flags=re.S)

        # --- JSON COMPONENT SHIELDING ---
        # Protects <JsonDisplay /> from being mangled into latexmath/footnotes by Pandoc
        def shield_json_display(match: Match) -> str:
            # Pure alphanumeric placeholder to avoid Pandoc escaping
            placeholder = f"JSONP{len(self.protected_json)}PROTECT"
            self.protected_json.append(match.group(1))
            return placeholder

        content = re.sub(r'(<JsonDisplay.*?\/>)', shield_json_display, content, flags=re.DOTALL)

        # Protect existing Markdown IDs so Pandoc doesn't mangle curly braces
        content = re.sub(r'\{#(.*?)\}', r'IDSHIELDSTART\1IDSHIELDEND', content)

        return content

    def post_process_asciidoc(self, content: str) -> str:
        """
        Finalizes the AsciiDoc output after Pandoc has finished.

        Order of Operations:
        1. Reset ID tracker and prioritize H1 Document Title.
        2. Process H2-H6 headings with collision avoidance.
        3. Apply Global Branding Attributes to body prose.
        4. Construct the Metadata Header.
        5. Restore shielded blocks and clean Pandoc noise.
        6. Apply Antora-specific normalization (Xrefs & Image Scaling).

        Args:
            content (str): The raw AsciiDoc output from Pandoc.

        Returns:
            str: The finalized AsciiDoc content ready for Antora.
        """
        # --- 1. INITIALIZATION & H1 PRIORITY ---
        self.used_ids = set()
        today = datetime.now().strftime("%Y-%m-%d")

        # A. Create the ID Slug from the RAW title BEFORE branding
        # This prevents ID leakage (e.g., [#suse-storage] instead of [#{longhorn-product-name}])
        title_slug = self._slugify(self.discovered_title or "untitled")

        # B. Prepare the title for DISPLAY only
        title_text = self.discovered_title or "Untitled Document"
        title_text = self._apply_global_attributes(title_text)

        # --- 2. HEADING SLUGGING & NORMALIZATION ---
        def heading_anchor_logic(match: Match) -> str:
            level_chars = match.group(1)
            raw_title = match.group(2).strip()

            # Check for shielded custom IDs from Markdown (e.g., {#my-custom-id})
            custom_id_match = re.search(r'IDSHIELDSTART(.*?)IDSHIELDEND', raw_title)

            if custom_id_match:
                base_id = custom_id_match.group(1)
                display_title = raw_title.replace(custom_id_match.group(0), "").strip()

                # Collision avoidance even for custom IDs
                final_id = base_id
                counter = 1
                while final_id in self.used_ids:
                    final_id = f"{base_id}-{counter}"
                    counter += 1
                self.used_ids.add(final_id)
                # Apply branding to display title after ID is locked
                display_title = self._apply_global_attributes(display_title)
            else:
                # Regular heading: slugify RAW title first for clean SEO
                final_id = self._slugify(raw_title)
                # Then brand the display title for the reader
                display_title = self._apply_global_attributes(raw_title)

            # Explicit anchor keeps URLs stable even if the title text changes later.
            return f"\n[#{final_id}]\n{level_chars} {display_title}"

        # Transform H2-H6 levels (Pandoc's == syntax)
        content = re.sub(r'\n(={2,6})\s+(.*)', heading_anchor_logic, content)

        # Heading cleanup
        content = re.sub(r'IDSHIELDSTART.*?IDSHIELDEND', '', content)
        content = re.sub(r'^={6,}\s+', r'===== ', content, flags=re.M)

        # C. Apply Global Attributes to the body content AFTER headings are locked
        content = self._apply_global_attributes(content)

        # --- 3. CONSTRUCT HEADER BLOCK ---
        header_lines = [
            f"[#{title_slug}]",
            f"= {title_text}",
            ":idprefix:",
            ":idseparator: -"
        ]

        # Inject YAML metadata
        for key, value in self.metadata.items():
            if key.lower() != "title":
                header_lines.append(f":{key}: {value}")

        header_lines.append(f":revdate: {today}")

        # Add global Antora headers from config
        antora_cfg = self.config.get("antora", {})
        header_lines.extend(antora_cfg.get("headers", []))
        header_block = "\n".join(header_lines) + "\n\n"

        # --- 4. MARKER RESTORATION & CLEANUP ---
        # Restore JSON Components
        if hasattr(self, 'protected_json'):
            for i, original in enumerate(self.protected_json):
                content = content.replace(f"JSONP{i}PROTECT", original)

        # Clean Pandoc artifacts (literal '++' passthrough escapes)
        content = content.replace("++_++", "_").replace("++{++", "{").replace("++}++", "}")
        content = content.replace("++{{++", "{{").replace("++}}++", "}}")
        content = content.replace("++<++", "<").replace("++>++", ">")
        content = content.replace("++*++", "*")  # "++_++" already handled above

        # Restore video embeds
        content = re.sub(r'VIDEOTOKEN([a-zA-Z0-9_-]+)', r'video::\1[youtube]', content)
        content = content.replace('’', "'").replace('‘', "'").replace('“', '"').replace('”', '"').replace('…', '...')

        # Apply cleanup regex from config
        cleanup = self.conv_cfg.get("cleanup_regex", [])
        for c in cleanup:
            flags = re.M if c.get("flags") == "M" else 0
            regex = c.get("regex")
            replacement = c.get("replacement")

            if c.get("hook") == "uppercase_label":
                def uppercase_hook(m: Match) -> str:
                    return f"[{m.group(1).upper()}]\n====\n{m.group(2).strip()}\n===="
                content = re.sub(regex, uppercase_hook, content, flags=flags)
            else:
                content = re.sub(regex, replacement, content, flags=flags)

        # --- 5. ADMONITION PROMOTION ---
        def promote_admo(m: Match) -> str:
            return f"[{m.group(1).upper()}]\n====\n{m.group(2).strip()}\n===="

        # (?i) already covers all casings, so a single 'Important' alternative suffices.
        content = re.sub(r'(?i)^\*?(Note|Warning|Tip|Caution|Important)[:]?\*?[:]?\s+(.*)$', promote_admo, content, flags=re.M)

        # --- 6. DYNAMIC RESTORATIONS ---
        restorations = self.conv_cfg.get("restoration_patterns", [])
        for r in restorations:
            regex, replacement = r.get("regex"), r.get("replacement")
            if r.get("hook") == "restore_spaces":
                def restore_hook(m: Match) -> str:
                    full_block = m.group(1)
                    parts = full_block.split("SHIELDSEP", 1) if "SHIELDSEP" in full_block else full_block.split("\n", 1)
                    title = parts[0].replace('PROTECTSPACE', ' ').strip()
                    body = parts[1].strip() if len(parts) > 1 else ""
                    return f".{title}\n[%collapsible]\n======\n{body}\n======"
                content = re.sub(regex, restore_hook, content, flags=re.S)
            else:
                mapping = r.get("map")
                if mapping:
                    for key, val in mapping.items():
                        content = re.sub(regex.replace("{key}", key), replacement.replace("{val}", val), content, flags=re.S)
                else:
                    content = re.sub(regex, replacement, content, flags=re.S)

        # --- 7. ANTORA NORMALIZATION (Xrefs & Images) ---
        # Image path cleanup: Strip /images/ and ensure double colon '::' for block images
        content = re.sub(r'image:/?images/(.*?)\[', r'image::\1[', content)

        # Ensure scaling is applied even if the path was already clean
        content = re.sub(r'image::([^\[]+)\[\]', r'image::\1[pdfwidth=100%,scalewidth=100%]', content)

        # Antora Xref Normalization: Convert Pandoc's link/xref syntax into clean Antora xrefs.
        def antora_xref_logic(m: Match) -> str:
            raw_path = m.group(1) or ""
            anchor = m.group(2) or ""
            path = raw_path.replace(".md", "").replace(".adoc", "").replace("./", "").strip("/")
            if "inputs/" in path:
                path = path.split("inputs/")[-1]
            if path:
                path = f"{path}.adoc"
            cl_anchor = ""
            if anchor:
                cl_anchor = "#" + anchor.replace("#", "").lower()
            return f"xref:{path}{cl_anchor}"

        content = re.sub(r'(?:link:|xref:)(?:\+\+)?([^\[\s#\+]+)?(#[^\[\s\+]+)?(?:\+\+)?', antora_xref_logic, content)

        # --- 8. FINAL CLEANUP ---
        # NOTE(review): blank-line-before-list normalization is intentionally
        # disabled; re-enable only with tests covering nested lists.
        content = re.sub(r'\[source,mermaid\]\n----(.*?)----', r'[mermaid]\n....\1....', content, flags=re.DOTALL)
        content = content.replace("SHIELDADMONSTARTtabs", "[tabs]\n====")
        content = content.replace("SHIELDADMONEND", "====")
        content = re.sub(r'^@tab\s+(.*)$', r'\1::', content, flags=re.M)

        return header_block + content.strip()

    def convert_file(self, input_path: Path, output_path: Path) -> None:
        """
        Orchestrates the conversion of a single Markdown file to AsciiDoc.

        Args:
            input_path (Path): Source Markdown file.
            output_path (Path): Destination for the raw AsciiDoc.

        Returns:
            None: Writes the converted content to output_path.

        Raises:
            subprocess.CalledProcessError: If the Pandoc CLI invocation fails.
        """
        self.metadata = {}
        self.discovered_title = None

        raw_md = input_path.read_text(encoding='utf-8')
        ready_md = self.pre_process_markdown(raw_md)

        # We write to a temporary file so Pandoc sees the 'shielded' version
        temp_md = input_path.with_suffix('.tmp.md')
        temp_md.write_text(ready_md, encoding='utf-8')

        try:
            # Execute Pandoc CLI (list-form argv, no shell involved)
            subprocess.run(
                [
                    "pandoc",
                    "-f", "markdown-smart",
                    "-t", "asciidoc",
                    "--shift-heading-level-by=-1",
                    "--wrap=none",
                    "-o", str(output_path),
                    str(temp_md)
                ],
                check=True,
                capture_output=True
            )

            # Post-process the Pandoc result to restore shields and finalize headers
            final_adoc = self.post_process_asciidoc(output_path.read_text(encoding='utf-8'))
            output_path.write_text(final_adoc, encoding='utf-8')
        finally:
            # Tidy up transient files
            if temp_md.exists():
                temp_md.unlink()

A pattern-driven engine that transforms Markdown into Enterprise AsciiDoc.

Attributes:
  • config (Dict): Configuration extracted from pyproject.toml.
  • metadata (Dict): Extracted frontmatter (YAML) from the source file.
  • discovered_title (str): The inferred document title (H1 or YAML).
DocConverter(config_path: Optional[pathlib.Path] = None)
32    def __init__(self, config_path: Optional[Path] = None):
33        """Initializes the converter and loads conversion patterns."""
34        self.config_path = config_path or Path("pyproject.toml")
35        self.config = self._load_project_config()
36        self.conv_cfg = self.config.get("conversions", {})
37        self.metadata: Dict[str, Any] = {}
38        self.discovered_title = None
39        self.used_ids: Set[str] = set()
40        self.protected_json: List[str] = []
41        # --- SUSE Branding Attribute Map ---
42        # Format: "Raw Text to Find": "{attribute-variable-name}"
43        # ORDER MATTERS: Longest strings first to prevent partial matching.
44        self.attribute_map = {
45            "SUSE® Rancher Prime: Admission Policy Manager": "{kubewarden-product-name}",
46            "SUSE® Rancher Prime: Continuous Delivery": "{fleet-product-name}",
47            "SUSE® Rancher Prime: OS Manager": "{elemental-product-name}",
48            "SUSE® Rancher Prime: Cluster API": "{turtles-product-name}",
49            "SUSE® Rancher Prime: K3s": "{k3s-product-name}",
50            "SUSE® Rancher Prime: RKE2": "{rke2-product-name}",
51            "SUSE® Rancher Prime": "{rancher-product-name-tm}",
52            "SUSE Rancher Prime": "{rancher-product-name}",
53            "SUSE® Virtualization": "{harvester-product-name-tm}",
54            "SUSE Virtualization": "{harvester-product-name}",
55            "SUSE® Storage": "{longhorn-product-name-tm}",
56            "SUSE Storage": "{longhorn-product-name}",
57            "SUSE® Security": "{neuvector-product-name}",
58            "SUSE® Losant": "{losant-product-name}",
59            "SUSE Losant": "{losant-product-name}"
60        }

Initializes the converter and loads conversion patterns.

config_path
config
conv_cfg
metadata: Dict[str, Any]
discovered_title
used_ids: Set[str]
protected_json: List[str]
attribute_map
def pre_process_markdown(self, content: str) -> str:
135    def pre_process_markdown(self, content: str) -> str:
136        """
137        Prepares Markdown for Pandoc by shielding modern syntax and extracting metadata.
138
139        Args:
140            content (str): Raw Markdown string.
141
142        Returns:
143            str: "Shielded" Markdown ready for Pandoc.
144        """
145        self.metadata = {}
146        self.discovered_title = None
147        self.used_ids = set()
148        # Initialize storage for JSON components to protect them from Pandoc
149        self.protected_json = []
150
151        # --- 1. CODE BLOCK SHIELDING ---
152        # We protect '#' characters inside code blocks so the Title Scavenger 
153        # doesn't accidentally treat a code comment as the document's H1 title.
154        content = re.sub(r'(`{3}.*?`{3})', lambda m: m.group(1).replace('#', 'HASHSHIELD'), content, flags=re.DOTALL)
155
156        # --- 2. FRONTMATTER EXTRACTION ---
157        # Extracts YAML metadata (title, description, etc.) from the top of the MD file.
158        frontmatter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
159        if frontmatter_match:
160            try:
161                yaml_data = yaml.safe_load(frontmatter_match.group(1))
162                if isinstance(yaml_data, dict):
163                    self.metadata = yaml_data
164                content = content[frontmatter_match.end():]
165            except Exception:
166                self.metadata = {}
167
168        # --- 3. TITLE SCAVENGER ---
169        # Logic: Priority 1 is YAML 'title'. Priority 2 is the first H1 (#) found.
170        self.discovered_title = self.metadata.get('title')
171        if not self.discovered_title:
172            h1_match = re.search(r'^#\s+(.*)$', content, re.M)
173            if h1_match:
174                self.discovered_title = h1_match.group(1).strip()
175                # Remove the H1 from body as it will be promoted to the AsciiDoc Document Title (=).
176                content = content.replace(h1_match.group(0), "", 1)
177
178        # Restore shielded hashes after title scavenging is safe.
179        content = content.replace('HASHSHIELD', '#')
180
181        # --- 4. VIDEO & COMPLEX PATTERN SHIELDING ---
182        # Replaces complex HTML/Markdown blocks with tokens that Pandoc will ignore.
183        # This protects <iframe> embeds and custom ':::tabs' blocks.
184        content = re.sub(r'<iframe.*?embed/([^"?\s]+).*?</iframe>', r'VIDEOTOKEN\1', content)
185
186        patterns = self.conv_cfg.get("shielding_patterns", [])
187        for p in patterns:
188            regex = p.get("regex")
189            replacement = p.get("replacement")
190            
191            if p.get("hook") == "protect_spaces":
192                # Special hook for collapsibles to ensure spaces in titles aren't lost.
193                def protect_hook(match: Match) -> str:
194                    title = match.group(1).strip().replace(' ', 'PROTECTSPACE')
195                    body = match.group(2).strip()
196                    return replacement.replace(r"\1", title).replace(r"\2", body)
197                content = re.sub(regex, protect_hook, content, flags=re.S)
198            else:
199                content = re.sub(regex, replacement, content, flags=re.S)
200        
201        # --- JSON COMPONENT SHIELDING ---
202        # Protects <JsonDisplay /> from being mangled into latexmath/footnotes by Pandoc
203        def shield_json_display(match):
204            # Pure alphanumeric placeholder to avoid Pandoc escaping
205            placeholder = f"JSONP{len(self.protected_json)}PROTECT"
206            self.protected_json.append(match.group(1))
207            return placeholder
208
209        content = re.sub(r'(<JsonDisplay.*?\/>)', shield_json_display, content, flags=re.DOTALL)
210
211        # Protect existing Markdown IDs so Pandoc doesn't mangle curly braces
212        content = re.sub(r'\{#(.*?)\}', r'IDSHIELDSTART\1IDSHIELDEND', content)
213
214        return content

Prepares Markdown for Pandoc by shielding modern syntax and extracting metadata.

Arguments:
  • content (str): Raw Markdown string.
Returns:
  • str: "Shielded" Markdown ready for Pandoc.

def post_process_asciidoc(self, content: str) -> str:
    """
    Finalizes the AsciiDoc output after Pandoc has finished.

    Order of Operations:
    1. Reset ID tracker and prioritize H1 Document Title.
    2. Process H2-H6 headings with collision avoidance.
    3. Apply Global Branding Attributes to body prose.
    4. Construct the Metadata Header.
    5. Restore shielded blocks and clean Pandoc noise.
    6. Apply Antora-specific normalization (Xrefs & Image Scaling).

    Args:
        content (str): The raw AsciiDoc output from Pandoc.

    Returns:
        str: The finalized AsciiDoc content ready for Antora.
    """
    # --- 1. INITIALIZATION & H1 PRIORITY ---
    # Reset per-document state: IDs from a previous file must not trigger
    # collision suffixes in this one.
    self.used_ids = set()
    today = datetime.now().strftime("%Y-%m-%d")

    # A. Create the ID Slug from the RAW title BEFORE branding
    # This prevents ID leakage (e.g., [#suse-storage] instead of [#{longhorn-product-name}])
    title_slug = self._slugify(self.discovered_title or "untitled")

    # B. Prepare the title for DISPLAY only
    title_text = self.discovered_title or "Untitled Document"
    title_text = self._apply_global_attributes(title_text)

    # --- 2. HEADING SLUGGING & NORMALIZATION ---
    def heading_anchor_logic(match: Match) -> str:
        """Rewrites one '== Title' heading into '[#id]' anchor + heading."""
        level_chars = match.group(1)
        raw_title = match.group(2).strip()

        # Check for shielded custom IDs from Markdown (e.g., {#my-custom-id})
        custom_id_match = re.search(r'IDSHIELDSTART(.*?)IDSHIELDEND', raw_title)

        if custom_id_match:
            base_id = custom_id_match.group(1)
            display_title = raw_title.replace(custom_id_match.group(0), "").strip()

            # Collision avoidance even for custom IDs: append -1, -2, ...
            # until the ID is unique within this document.
            final_id = base_id
            counter = 1
            while final_id in self.used_ids:
                final_id = f"{base_id}-{counter}"
                counter += 1
            self.used_ids.add(final_id)
            # Apply branding to display title after ID is locked
            display_title = self._apply_global_attributes(display_title)
        else:
            # Regular heading: slugify RAW title first for clean SEO.
            # NOTE(review): this branch does not add to self.used_ids —
            # presumably _slugify handles uniqueness; confirm, otherwise
            # duplicate auto-generated headings could collide.
            final_id = self._slugify(raw_title)
            # Then brand the display title for the reader
            display_title = self._apply_global_attributes(raw_title)

        # We return the heading with an explicit anchor to ensure URL stability, even if the title text changes in the future.
        return f"\n[#{final_id}]\n{level_chars} {display_title}"

    # Transform H2-H6 levels (Pandoc's == syntax). The pattern requires a
    # preceding newline, so a '==' on the very first character of content
    # would not match.
    content = re.sub(r'\n(={2,6})\s+(.*)', heading_anchor_logic, content)

    # Heading cleanup: drop any shield markers that survived (e.g., custom
    # IDs outside headings), and clamp over-deep headings to level 5.
    content = re.sub(r'IDSHIELDSTART.*?IDSHIELDEND', '', content)
    content = re.sub(r'^={6,}\s+', r'===== ', content, flags=re.M)

    # C. Apply Global Attributes to the body content AFTER headings are locked
    content = self._apply_global_attributes(content)

    # --- 3. CONSTRUCT HEADER BLOCK ---
    header_lines = [
        f"[#{title_slug}]",
        f"= {title_text}",
        ":idprefix:",
        ":idseparator: -"
    ]

    # Inject YAML metadata (title is already rendered as the '=' line above)
    for key, value in self.metadata.items():
        if key.lower() != "title":
            header_lines.append(f":{key}: {value}")

    header_lines.append(f":revdate: {today}")

    # Add global Antora headers from config
    antora_cfg = self.config.get("antora", {})
    header_lines.extend(antora_cfg.get("headers", []))
    header_block = "\n".join(header_lines) + "\n\n"

    # --- 4. MARKER RESTORATION & CLEANUP ---
    # Restore JSON Components shielded in pre-processing (JSONP<i>PROTECT).
    # hasattr guard: protected_json is only set once pre_process_markdown ran.
    if hasattr(self, 'protected_json'):
        for i, original in enumerate(self.protected_json):
            content = content.replace(f"JSONP{i}PROTECT", original)

    # Clean Pandoc artifacts: Pandoc escapes special chars as ++x++ passthroughs.
    # NOTE(review): "++_++" is replaced both here and two lines below — the
    # second occurrence is a no-op duplicate.
    content = content.replace("++_++", "_").replace("++{++", "{").replace("++}++", "}")
    content = content.replace("++{{++", "{{").replace("++}}++", "}}")
    content = content.replace("++<++", "<").replace("++>++", ">")
    content = content.replace("++*++", "*").replace("++_++", "_")

    # Restore video embeds (tokens created in pre-processing from <iframe> IDs)
    content = re.sub(r'VIDEOTOKEN([a-zA-Z0-9_-]+)', r'video::\1[youtube]', content)
    # Normalize typographic ("smart") quotes and ellipses to plain ASCII
    content = content.replace('’', "'").replace('‘', "'").replace('“', '"').replace('”', '"').replace('…', '...')

    # Apply cleanup regex from config (data-driven; see pyproject.toml)
    cleanup = self.conv_cfg.get("cleanup_regex", [])
    for c in cleanup:
        # Only the "M" (multiline) flag is supported from config
        flags = re.M if c.get("flags") == "M" else 0
        regex = c.get("regex")
        replacement = c.get("replacement")

        if c.get("hook") == "uppercase_label":
            # Rewrites a labeled block into an AsciiDoc admonition delimiter
            def uppercase_hook(m: Match) -> str:
                return f"[{m.group(1).upper()}]\n====\n{m.group(2).strip()}\n===="
            content = re.sub(regex, uppercase_hook, content, flags=flags)
        else:
            content = re.sub(regex, replacement, content, flags=flags)

    # --- 5. ADMONITION PROMOTION ---
    # Promote prose lines like "Note: ..." into real AsciiDoc admonition blocks.
    def promote_admo(m: Match) -> str:
        return f"[{m.group(1).upper()}]\n====\n{m.group(2).strip()}\n===="

    # NOTE(review): with (?i) the trailing |IMPORTANT alternative is redundant.
    content = re.sub(r'(?i)^\*?(Note|Warning|Tip|Caution|Important|IMPORTANT)[:]?\*?[:]?\s+(.*)$', promote_admo, content, flags=re.M)

    # --- 6. DYNAMIC RESTORATIONS ---
    restorations = self.conv_cfg.get("restoration_patterns", [])
    for r in restorations:
        regex, replacement = r.get("regex"), r.get("replacement")
        if r.get("hook") == "restore_spaces":
            # Counterpart of the 'protect_spaces' shielding hook: rebuild a
            # collapsible block, turning PROTECTSPACE back into spaces.
            def restore_hook(m: Match) -> str:
                full_block = m.group(1)
                # Prefer the explicit SHIELDSEP separator; fall back to the
                # first newline when the separator token is absent.
                parts = full_block.split("SHIELDSEP", 1) if "SHIELDSEP" in full_block else full_block.split("\n", 1)
                title = parts[0].replace('PROTECTSPACE', ' ').strip()
                body = parts[1].strip() if len(parts) > 1 else ""
                return f".{title}\n[%collapsible]\n======\n{body}\n======"
            content = re.sub(regex, restore_hook, content, flags=re.S)
        else:
            mapping = r.get("map")
            if mapping:
                # Template expansion: {key}/{val} placeholders in the
                # configured regex/replacement, once per map entry.
                for key, val in mapping.items():
                    content = re.sub(regex.replace("{key}", key), replacement.replace("{val}", val), content, flags=re.S)
            else:
                content = re.sub(regex, replacement, content, flags=re.S)

    # --- 7. ANTORA NORMALIZATION (Xrefs & Images) ---
    # Image path cleanup: Strip /images/ and ensure double colon '::' for block images
    content = re.sub(r'image:/?images/(.*?)\[', r'image::\1[', content)

    # Ensure scaling is applied even if the path was already clean
    content = re.sub(r'image::([^\[]+)\[\]', r'image::\1[pdfwidth=100%,scalewidth=100%]', content)

    # Antora Xref Normalization: Convert Pandoc's link/xref syntax into clean Antora xrefs.
    def antora_xref_logic(m: Match) -> str:
        """Normalizes one link/xref: strips extensions, re-roots the path, lowercases the anchor."""
        raw_path = m.group(1) or ""
        anchor = m.group(2) or ""
        path = raw_path.replace(".md", "").replace(".adoc", "").replace("./", "").strip("/")
        # Drop any leading path up to and including 'inputs/' (source tree root)
        if "inputs/" in path:
            path = path.split("inputs/")[-1]
        if path:
            path = f"{path}.adoc"
        cl_anchor = ""
        if anchor:
            cl_anchor = "#" + anchor.replace("#", "").lower()
        return f"xref:{path}{cl_anchor}"

    # Both path and anchor groups are optional; ++ passthrough wrappers are consumed.
    content = re.sub(r'(?:link:|xref:)(?:\+\+)?([^\[\s#\+]+)?(#[^\[\s\+]+)?(?:\+\+)?', antora_xref_logic, content)

    # --- 8. FINAL CLEANUP ---
    # Ensure a blank line before lists (both * and .) to prevent squashing
    # Matches a non-newline character followed by a single newline and a list marker
    # NOTE(review): intentionally disabled — presumably it mangled non-list
    # lines starting with '.' (block titles); confirm before re-enabling.
    # content = re.sub(r'([^\n])\n([*.])\s', r'\1\n\n\2 ', content)

    # Convert Pandoc's mermaid source block into a [mermaid] literal block
    content = re.sub(r'\[source,mermaid\]\n----(.*?)----', r'[mermaid]\n....\1....', content, flags=re.DOTALL)
    # Restore shielded ':::tabs' containers and their end markers
    content = content.replace("SHIELDADMONSTARTtabs", "[tabs]\n====")
    content = content.replace("SHIELDADMONEND", "====")
    content = re.sub(r'^@tab\s+(.*)$', r'\1::', content, flags=re.M)

    return header_block + content.strip()

Finalizes the AsciiDoc output after Pandoc has finished.

Order of Operations:

  1. Reset ID tracker and prioritize H1 Document Title.
  2. Process H2-H6 headings with collision avoidance.
  3. Apply Global Branding Attributes to body prose.
  4. Construct the Metadata Header.
  5. Restore shielded blocks and clean Pandoc noise.
  6. Apply Antora-specific normalization (Xrefs & Image Scaling).
Arguments:
  • content (str): The raw AsciiDoc output from Pandoc.
Returns:
  • str: The finalized AsciiDoc content ready for Antora.

def convert_file(self, input_path: Path, output_path: Path) -> None:
    """
    Orchestrates the conversion of a single Markdown file to AsciiDoc.

    Pipeline: shield modern Markdown syntax, run Pandoc on a staging copy,
    then post-process the result to restore shields and finalize headers.

    Args:
        input_path (Path): Source Markdown file.
        output_path (Path): Destination for the raw AsciiDoc.

    Returns:
        None: Writes the converted content to output_path.
    """
    # Reset per-file state so metadata from a previous run cannot leak in.
    self.metadata = {}
    self.discovered_title = None

    source_text = input_path.read_text(encoding='utf-8')
    shielded_md = self.pre_process_markdown(source_text)

    # Pandoc must see the 'shielded' version, so stage it in a sibling file.
    staging_file = input_path.with_suffix('.tmp.md')
    staging_file.write_text(shielded_md, encoding='utf-8')

    pandoc_cmd = [
        "pandoc",
        "-f", "markdown-smart",
        "-t", "asciidoc",
        "--shift-heading-level-by=-1",
        "--wrap=none",
        "-o", str(output_path),
        str(staging_file),
    ]

    try:
        # Execute Pandoc CLI (raises CalledProcessError on failure)
        subprocess.run(pandoc_cmd, check=True, capture_output=True)

        # Post-process the Pandoc result to restore shields and finalize headers
        raw_adoc = output_path.read_text(encoding='utf-8')
        output_path.write_text(self.post_process_asciidoc(raw_adoc), encoding='utf-8')
    finally:
        # Always remove the transient staging file, even on Pandoc failure.
        if staging_file.exists():
            staging_file.unlink()

Orchestrates the conversion of a single Markdown file to AsciiDoc.

Arguments:
  • input_path (Path): Source Markdown file.
  • output_path (Path): Destination for the raw AsciiDoc.
Returns:
  • None: Writes the converted content to output_path.

class StyleLinter:
 31class StyleLinter:
 32    """
 33    Orchestrates linguistic and style validation using Vale.
 34
 35    Attributes:
 36        target_path (Path): The specific file (adoc/md) to be scanned.
 37        config_path (Path): Path to the project's pyproject.toml.
 38        vale_ini (Path): The path where the temporary .vale.ini will be created.
 39        config (Dict): Loaded configuration specific to the transpiler-pro tool.
 40    """
 41
 42    def __init__(self, target_path: Path, config_path: Optional[Path] = None):
 43        """
 44        Initializes the linter and prepares the configuration environment.
 45        
 46        Args:
 47            target_path (Path): File to be validated.
 48            config_path (Path, optional): Path to pyproject.toml. Defaults to root.
 49        """
 50        self.target_path = target_path
 51        self.config_path = config_path or Path("pyproject.toml")
 52        
 53        # We generate the .vale.ini in the same directory as the config for context isolation.
 54        self.vale_ini: Path = self.config_path.parent / ".vale.ini"
 55        
 56        self.config = self._load_project_config()
 57
 58    def _load_project_config(self) -> Dict[str, Any]:
 59        """Loads linter-specific settings from the [tool.transpiler-pro] section."""
 60        if not self.config_path.exists():
 61            return {}
 62        try:
 63            with open(self.config_path, "rb") as f:
 64                return tomllib.load(f).get("tool", {}).get("transpiler-pro", {})
 65        except Exception as e:
 66            console.print(f"[bold red]Error loading linter config:[/] {e}")
 67            return {}
 68
 69    def setup_config(self) -> None:
 70        """
 71        Generates a temporary `.vale.ini` file required by the Vale CLI.
 72        
 73        This method performs two key tasks:
 74        1. Dynamic Vocab: Reads 'technical_terms' from the Knowledge Base and 
 75           writes them to a Vale 'accept.txt' file so they are ignored by 
 76           spelling checks.
 77        2. Config Generation: Injects style paths, alert levels, and rule-sets 
 78           defined in pyproject.toml into the INI format.
 79        """
 80        linter_cfg = self.config.get("linter", {})
 81        # Ensure paths use forward slashes for cross-platform compatibility in Vale
 82        styles_root = str(STYLES_DIR.resolve()).replace("\\", "/")
 83        
 84        # --- PHASE 1: DYNAMIC VOCABULARY INJECTION ---
 85        kb_setting = self.config.get("pipeline", {}).get("knowledge_base", "data/knowledge_base.json")
 86        kb_path = Path(kb_setting)
 87        vocab_setting = ""
 88        
 89        if kb_path.exists():
 90            try:
 91                kb_data = json.loads(kb_path.read_text(encoding="utf-8"))
 92                tech_terms = kb_data.get("technical_terms", [])
 93                
 94                if tech_terms:
 95                    # Vale expects a specific folder structure for Vocabularies
 96                    vocab_dir = STYLES_DIR / "vocabularies" / "Project"
 97                    vocab_dir.mkdir(parents=True, exist_ok=True)
 98                    accept_file = vocab_dir / "accept.txt"
 99                    
100                    # Store terms in the accepted list
101                    accept_file.write_text("\n".join(tech_terms), encoding="utf-8")
102                    vocab_setting = "Vocab = Project"
103            except Exception as e:
104                console.print(f"[yellow]⚠️ Warning:[/] Vocabulary injection failed: {e}")
105
106        # --- PHASE 2: INI CONSTRUCTION ---
107        styles = linter_cfg.get("styles", ["Vale", "common", "asciidoc"])
108        styles_str = ", ".join(styles)
109        min_level = linter_cfg.get("min_alert_level", "suggestion")
110
111        # Construct the Vale configuration string
112        config_raw = f"""
113        StylesPath = {styles_root}
114        MinAlertLevel = {min_level}
115        {vocab_setting}
116
117        [*.{{adoc,md}}]
118        BasedOnStyles = {styles_str}
119        
120        # Use the Asciidoctor parser for accurate block identification
121        asciidoctor = true
122        """
123        
124        self.vale_ini.write_text(textwrap.dedent(config_raw).strip())
125
126    def _extract_suggestion(self, issue: Dict[str, Any]) -> str:
127        """
128        Extracts a viable repair suggestion from a Vale violation.
129        
130        Vale reports often include 'Action' parameters (e.g., the correct 
131        spelling). If those aren't available, this method uses regex patterns 
132        from pyproject.toml to "scrape" the suggestion out of the error message.
133        """
134        action_params = issue.get("Action", {}).get("Params", [])
135        patterns_cfg = self.config.get("patterns", {})
136        ignored = patterns_cfg.get("ignored_placeholders", [])
137        
138        # Priority 1: Check Vale's native suggestion parameters
139        if action_params:
140            candidate = str(action_params[0])
141            if candidate not in ignored:
142                return candidate
143
144        # Priority 2: Scrape suggestions from the Message text using Regex
145        # e.g., Message: "Use 'SUSE' instead of 'suse'" -> Extracts 'SUSE'
146        search_pool = issue.get("Description", "") + " " + issue.get("Message", "")
147        pattern = patterns_cfg.get("suggestion_extraction", r"['\"‘“’](.*?)['\"’]")
148        
149        if pattern and search_pool.strip():
150            match = re.search(pattern, search_pool)
151            if match:
152                return match.group(1)
153        
154        return ""
155
156    def run(self) -> Dict[str, List[Dict[str, Any]]]:
157        """
158        Executes the Vale CLI and returns a structured map of findings.
159        
160        Returns:
161            Dict: Key is file path, Value is a list of violation dictionaries 
162                  containing Line, Check ID, Severity, and Suggestion.
163        """
164        try:
165            abs_target = str(self.target_path.resolve())
166            
167            # Execute Vale in JSON mode for programmatic parsing
168            result = subprocess.run(
169                ["vale", "--config", str(self.vale_ini.resolve()), "--output=JSON", abs_target],
170                capture_output=True,
171                text=True,
172                check=False
173            )
174            
175            if not result.stdout or result.stdout.strip() == "":
176                return {}
177
178            raw_data = json.loads(result.stdout)
179            processed_findings = {}
180
181            # Convert raw Vale schema to Transpiler-Pro's internal repair schema
182            for file_path, file_issues in raw_data.items():
183                processed_findings[file_path] = []
184                for issue in file_issues:
185                    processed_findings[file_path].append({
186                        "Line": issue.get("Line"),
187                        "Check": issue.get("Check"),
188                        "Severity": issue.get("Severity"),
189                        "Message": issue.get("Message"),
190                        "Description": issue.get("Description", ""),
191                        "Suggestion": self._extract_suggestion(issue)
192                    })
193                
194            return processed_findings
195
196        except (FileNotFoundError, json.JSONDecodeError, subprocess.SubprocessError) as e:
197            console.print(f"[bold red]Linter Execution Error:[/] {e}")
198            return {}
199
200    def display_report(self, data: Dict[str, List[Dict[str, Any]]]) -> None:
201        """
202        Renders a user-friendly report of the findings.
203        The actual visual table is commented out to allow CLI orchestration 
204        to handle final output density, but the logic remains for debugging.
205        """
206        if not data or not any(data.values()):
207            console.print("\n✨ [bold green]Quality Check Passed: Document meets all style guide requirements.[/]")
208            return
209
210        # Table rendering code...
211        # Theme-based coloring for different alert levels
212        # linter_cfg = self.config.get("linter", {})
213        # theme = linter_cfg.get("theme", {"error": "red", "warning": "yellow", "suggestion": "blue"})
214        # table = Table(title="Style Guide Validation Report", title_style="bold cyan")
215        # table.add_column("Line", style="magenta", justify="right")
216        # table.add_column("Severity", style="bold")
217        # table.add_column("Message", style="white")
218        # table.add_column("Rule ID", style="yellow")
219
220        # for _, issues in data.items():
221        #     for issue in issues:
222        #         sev = issue['Severity']
223        #         color = theme.get(sev.lower(), "white")
224                
225        #         table.add_row(
226        #             str(issue['Line']),
227        #             f"[{color}]{sev}[/]",
228        #             issue['Message'],
229        #             issue['Check']
230        #         )
231
232        # console.print(table)

Orchestrates linguistic and style validation using Vale.

Attributes:
  • target_path (Path): The specific file (adoc/md) to be scanned.
  • config_path (Path): Path to the project's pyproject.toml.
  • vale_ini (Path): The path where the temporary .vale.ini will be created.
  • config (Dict): Loaded configuration specific to the transpiler-pro tool.
StyleLinter( target_path: pathlib.Path, config_path: Optional[pathlib.Path] = None)
42    def __init__(self, target_path: Path, config_path: Optional[Path] = None):
43        """
44        Initializes the linter and prepares the configuration environment.
45        
46        Args:
47            target_path (Path): File to be validated.
48            config_path (Path, optional): Path to pyproject.toml. Defaults to root.
49        """
50        self.target_path = target_path
51        self.config_path = config_path or Path("pyproject.toml")
52        
53        # We generate the .vale.ini in the same directory as the config for context isolation.
54        self.vale_ini: Path = self.config_path.parent / ".vale.ini"
55        
56        self.config = self._load_project_config()

Initializes the linter and prepares the configuration environment.

Arguments:
  • target_path (Path): File to be validated.
  • config_path (Path, optional): Path to pyproject.toml. Defaults to root.
target_path
config_path
vale_ini: pathlib.Path
config
def setup_config(self) -> None:
 69    def setup_config(self) -> None:
 70        """
 71        Generates a temporary `.vale.ini` file required by the Vale CLI.
 72        
 73        This method performs two key tasks:
 74        1. Dynamic Vocab: Reads 'technical_terms' from the Knowledge Base and 
 75           writes them to a Vale 'accept.txt' file so they are ignored by 
 76           spelling checks.
 77        2. Config Generation: Injects style paths, alert levels, and rule-sets 
 78           defined in pyproject.toml into the INI format.
 79        """
 80        linter_cfg = self.config.get("linter", {})
 81        # Ensure paths use forward slashes for cross-platform compatibility in Vale
 82        styles_root = str(STYLES_DIR.resolve()).replace("\\", "/")
 83        
 84        # --- PHASE 1: DYNAMIC VOCABULARY INJECTION ---
 85        kb_setting = self.config.get("pipeline", {}).get("knowledge_base", "data/knowledge_base.json")
 86        kb_path = Path(kb_setting)
 87        vocab_setting = ""
 88        
 89        if kb_path.exists():
 90            try:
 91                kb_data = json.loads(kb_path.read_text(encoding="utf-8"))
 92                tech_terms = kb_data.get("technical_terms", [])
 93                
 94                if tech_terms:
 95                    # Vale expects a specific folder structure for Vocabularies
 96                    vocab_dir = STYLES_DIR / "vocabularies" / "Project"
 97                    vocab_dir.mkdir(parents=True, exist_ok=True)
 98                    accept_file = vocab_dir / "accept.txt"
 99                    
100                    # Store terms in the accepted list
101                    accept_file.write_text("\n".join(tech_terms), encoding="utf-8")
102                    vocab_setting = "Vocab = Project"
103            except Exception as e:
104                console.print(f"[yellow]⚠️ Warning:[/] Vocabulary injection failed: {e}")
105
106        # --- PHASE 2: INI CONSTRUCTION ---
107        styles = linter_cfg.get("styles", ["Vale", "common", "asciidoc"])
108        styles_str = ", ".join(styles)
109        min_level = linter_cfg.get("min_alert_level", "suggestion")
110
111        # Construct the Vale configuration string
112        config_raw = f"""
113        StylesPath = {styles_root}
114        MinAlertLevel = {min_level}
115        {vocab_setting}
116
117        [*.{{adoc,md}}]
118        BasedOnStyles = {styles_str}
119        
120        # Use the Asciidoctor parser for accurate block identification
121        asciidoctor = true
122        """
123        
124        self.vale_ini.write_text(textwrap.dedent(config_raw).strip())

Generates a temporary .vale.ini file required by the Vale CLI.

This method performs two key tasks:

  1. Dynamic Vocab: Reads 'technical_terms' from the Knowledge Base and writes them to a Vale 'accept.txt' file so they are ignored by spelling checks.
  2. Config Generation: Injects style paths, alert levels, and rule-sets defined in pyproject.toml into the INI format.
def run(self) -> Dict[str, List[Dict[str, Any]]]:
156    def run(self) -> Dict[str, List[Dict[str, Any]]]:
157        """
158        Executes the Vale CLI and returns a structured map of findings.
159        
160        Returns:
161            Dict: Key is file path, Value is a list of violation dictionaries 
162                  containing Line, Check ID, Severity, and Suggestion.
163        """
164        try:
165            abs_target = str(self.target_path.resolve())
166            
167            # Execute Vale in JSON mode for programmatic parsing
168            result = subprocess.run(
169                ["vale", "--config", str(self.vale_ini.resolve()), "--output=JSON", abs_target],
170                capture_output=True,
171                text=True,
172                check=False
173            )
174            
175            if not result.stdout or result.stdout.strip() == "":
176                return {}
177
178            raw_data = json.loads(result.stdout)
179            processed_findings = {}
180
181            # Convert raw Vale schema to Transpiler-Pro's internal repair schema
182            for file_path, file_issues in raw_data.items():
183                processed_findings[file_path] = []
184                for issue in file_issues:
185                    processed_findings[file_path].append({
186                        "Line": issue.get("Line"),
187                        "Check": issue.get("Check"),
188                        "Severity": issue.get("Severity"),
189                        "Message": issue.get("Message"),
190                        "Description": issue.get("Description", ""),
191                        "Suggestion": self._extract_suggestion(issue)
192                    })
193                
194            return processed_findings
195
196        except (FileNotFoundError, json.JSONDecodeError, subprocess.SubprocessError) as e:
197            console.print(f"[bold red]Linter Execution Error:[/] {e}")
198            return {}

Executes the Vale CLI and returns a structured map of findings.

Returns:
  • Dict: Key is file path, Value is a list of violation dictionaries containing Line, Check ID, Severity, and Suggestion.

def display_report(self, data: Dict[str, List[Dict[str, Any]]]) -> None:
200    def display_report(self, data: Dict[str, List[Dict[str, Any]]]) -> None:
201        """
202        Renders a user-friendly report of the findings.
203        The actual visual table is commented out to allow CLI orchestration 
204        to handle final output density, but the logic remains for debugging.
205        """
206        if not data or not any(data.values()):
207            console.print("\n✨ [bold green]Quality Check Passed: Document meets all style guide requirements.[/]")
208            return
209
210        # Table rendering code...
211        # Theme-based coloring for different alert levels
212        # linter_cfg = self.config.get("linter", {})
213        # theme = linter_cfg.get("theme", {"error": "red", "warning": "yellow", "suggestion": "blue"})
214        # table = Table(title="Style Guide Validation Report", title_style="bold cyan")
215        # table.add_column("Line", style="magenta", justify="right")
216        # table.add_column("Severity", style="bold")
217        # table.add_column("Message", style="white")
218        # table.add_column("Rule ID", style="yellow")
219
220        # for _, issues in data.items():
221        #     for issue in issues:
222        #         sev = issue['Severity']
223        #         color = theme.get(sev.lower(), "white")
224                
225        #         table.add_row(
226        #             str(issue['Line']),
227        #             f"[{color}]{sev}[/]",
228        #             issue['Message'],
229        #             issue['Check']
230        #         )
231
232        # console.print(table)

Renders a user-friendly report of the findings. The actual visual table is commented out to allow CLI orchestration to handle final output density, but the logic remains for debugging.

class StyleFixer:
 27class StyleFixer:
 28    """
 29    NLP-enhanced repair engine that learns and persists style corrections.
 30    
 31    Attributes:
 32        config (Dict): Tool configuration extracted from pyproject.toml.
 33        kb_path (Path): Location of the persistent JSON knowledge base.
 34        kb (Dict): The internal memory of the fixer (Branding + Learned terms).
 35        nlp: The spaCy language model used for linguistic context checks.
 36    """
 37
 38    def __init__(self, config_path: Optional[Path] = None) -> None:
 39        """
 40        Initializes the fixer and loads the persistent Knowledge Base.
 41        
 42        Args:
 43            config_path (Path, optional): Custom path to pyproject.toml.
 44        """
 45        self.config_path = config_path or Path("pyproject.toml")
 46        self.config = self._load_config()
 47        
 48        # Load the Knowledge Base (JSON) which stores branding and learned words.
 49        kb_setting = self.config.get("pipeline", {}).get("knowledge_base", "data/knowledge_base.json")
 50        self.kb_path = Path(kb_setting)
 51        self.kb = self._load_kb()
 52
 53        try:
 54            self.nlp = spacy.load("en_core_web_sm")
 55        except Exception:
 56            # Fallback if spaCy is missing; some tense-shifting features may be limited.
 57            self.nlp = None
 58
 59    def _load_config(self) -> Dict[str, Any]:
 60        """Reads the [tool.transpiler-pro] section from the project TOML."""
 61        if not self.config_path.exists(): 
 62            return {}
 63        try:
 64            with open(self.config_path, "rb") as f:
 65                return tomllib.load(f).get("tool", {}).get("transpiler-pro", {})
 66        except (tomllib.TOMLDecodeError, OSError):
 67            return {}
 68
 69    def _load_kb(self) -> Dict[str, Any]:
 70        """Loads the JSON brain. Initializes empty branding/learned dicts if missing."""
 71        if self.kb_path.exists():
 72            try:
 73                return json.loads(self.kb_path.read_text(encoding="utf-8"))
 74            except (json.JSONDecodeError, OSError):
 75                pass
 76        return {"branding": {}, "learned": {}}
 77
 78    def _save_kb(self) -> None:
 79        """Persists learned corrections to disk for future pipeline runs."""
 80        try:
 81            self.kb_path.parent.mkdir(parents=True, exist_ok=True)
 82            self.kb_path.write_text(json.dumps(self.kb, indent=4), encoding="utf-8")
 83        except Exception as e:
 84            console.print(f"[red]Error saving Knowledge Base:[/] {e}")
 85
 86    def _get_progressive_verb(self, verb_token) -> str:
 87        """
 88        Logic to convert a verb to its '-ing' form.
 89        
 90        Prioritizes the 'special_verbs' table in pyproject.toml to handle 
 91        irregular conjugations (e.g., 'stop' -> 'stopping') before falling 
 92        back to standard English suffix rules.
 93        """
 94        lemma = verb_token.lemma_.lower()
 95        grammar_cfg = self.config.get("grammar", {})
 96        special = grammar_cfg.get("special_verbs", {})
 97        
 98        if lemma in special:
 99            return special[lemma]
100
101        # Standard -ing rules
102        if lemma.endswith("e") and not lemma.endswith("ee"):
103            return lemma[:-1] + "ing"
104        # CVC rule: Double the consonant (e.g., run -> running)
105        if len(lemma) > 2 and lemma[-1] not in "aeiou" and lemma[-2] in "aeiou" and lemma[-3] not in "aeiou":
106            return lemma + lemma[-1] + "ing"
107        return lemma + "ing"
108
109    def _fix_tense(self, line: str) -> str:
110        """
111        Standard Tense Shifter: "We will test" -> "We are testing".
112        Note: This is an legacy/alternative shifter; primary tense shifting 
113        is now handled by the more advanced LinguisticEngine in repair.py.
114        """
115        if not self.nlp: 
116            return line
117        doc = self.nlp(line)
118        working_line = line
119        for token in doc:
120            if token.text.lower() == "will":
121                main_verb = token.head
122                if main_verb.pos_ == "VERB":
123                    # Determine plurality for correct aux verb (is vs are)
124                    subjects = [w for w in main_verb.lefts if "subj" in w.dep_]
125                    is_plural = any("Number=Plur" in str(s.morph) or s.text.lower() in ["we", "they", "you"] for s in subjects)
126                    aux = "are" if is_plural else "is"
127                    prog = self._get_progressive_verb(main_verb)
128                    working_line = re.sub(rf"\b{token.text}\s+{main_verb.text}\b", f"{aux} {prog}", working_line, flags=re.IGNORECASE)
129        return working_line
130
131    def fix_file(self, file_path: Path, violations: List[Dict[str, Any]]) -> int:
132        """
133        The main repair loop. Iterates through line-specific violations and 
134        applies branding and style corrections.
135
136        Args:
137            file_path (Path): Path to the generated AsciiDoc file.
138            violations (List[Dict]): List of findings from the Linter.
139            
140        Returns:
141            int: Number of lines successfully modified.
142        """
143        if not file_path.exists(): 
144            return 0
145        content = file_path.read_text(encoding="utf-8").splitlines()
146        total_fixes = 0
147        
148        # Group issues by line number for efficient processing
149        line_map = defaultdict(list)
150        for v in violations: 
151            line_map[v.get("Line", 0)].append(v)
152
153        patterns = self.config.get("patterns", {})
154        extract_re = patterns.get("suggestion_extraction", r"'(.*?)'")
155        remove_trigger = patterns.get("removal_trigger", "removing")
156        instead_of_trigger = patterns.get("instead_of_trigger", "instead of")
157
158        # Current branding context (Permanent + Learned during this session)
159        session_branding = {**self.kb.get("learned", {}), **self.kb.get("automated_fixes", {})}
160
161        # Process lines in reverse order to ensure line-length changes don't shift indices
162        for line_num in sorted(line_map.keys(), reverse=True):
163            idx = line_num - 1
164            if idx < 0 or idx >= len(content): 
165                continue
166            
167            working_line = content[idx]
168            original_line = working_line
169
170            # --- PHASE 1: LINTER-DRIVEN REPAIRS ---
171            for issue in line_map[line_num]:
172                msg = issue.get("Message", "")
173                check_id = issue.get("Check", "")
174                suggestion = issue.get("Suggestion", "")
175
176                # 1. Branding Sync (e.g., Use 'SUSE' instead of 'suse')
177                for wrong, correct in session_branding.items():
178                    if f"'{wrong}'" in msg.lower() or f"‘{wrong}’" in msg.lower():
179                        working_line = re.sub(rf"\b{re.escape(wrong)}\b", correct, working_line, flags=re.IGNORECASE)
180
181                # 2. Surgical Removal (e.g., "Note that...", "Actually...")
182                if remove_trigger in msg.lower() or "Editorializing" in check_id:
183                    target = suggestion if suggestion else (re.findall(extract_re, msg)[0] if re.findall(extract_re, msg) else None)
184                    if target:
185                        working_line = re.sub(rf"\b{re.escape(target)}\b\s?", "", working_line, flags=re.IGNORECASE)
186
187                # 3. Phrasal Substitution (e.g., "Use 'X' instead of 'Y'")
188                elif instead_of_trigger in msg.lower():
189                    if suggestion:
190                        m = re.findall(extract_re, msg)
191                        wrong_term = m[1] if len(m) >= 2 else (m[0] if m else "")
192                        if wrong_term:
193                            # --- GUARDRAIL: Let repair.py handle complex tense shifts ---
194                            if "will" in wrong_term.lower() or "will" in msg.lower():
195                                continue
196                            working_line = re.sub(rf"\b{re.escape(wrong_term)}\b", suggestion, working_line, flags=re.IGNORECASE)
197
198                # 4. Auto-Learning: Capture spelling fixes into the Knowledge Base
199                elif "Spelling" in check_id:
200                    if suggestion and suggestion.lower() not in ["spelling", "spellings", "learned"]:
201                        match = re.findall(extract_re, msg)
202                        word_to_fix = match[0] if match else ""
203                        if word_to_fix:
204                            working_line = re.sub(rf"\b{re.escape(word_to_fix)}\b", suggestion, working_line)
205                            # Persist this correction for future automation
206                            if word_to_fix.lower() not in session_branding:
207                                self.kb["learned"][word_to_fix.lower()] = suggestion
208
209            # --- PHASE 2: GLOBAL BRANDING & FORMATTING GUARDRAILS ---
210            
211            # 1. Branding Guardrail: Apply core branding safely (no URL/Path corruption)
212            for wrong, correct in self.kb.get("automated_fixes", {}).items():
213                # Negative lookarounds (?<![\/-]) prevent breaking paths like /img/suse-logo.svg
214                pattern = rf"(?<![\/-])\b{re.escape(wrong)}\b(?![\/-])"
215                working_line = re.sub(pattern, correct, working_line, flags=re.IGNORECASE)
216
217            # 2. Fragment Healer: Ensure sentences start with capital letters
218            # Ignores lines starting with AsciiDoc technical syntax
219            if not re.match(r'^(image::|video::|xref:|link:|http|\[|:)', working_line, flags=re.IGNORECASE):
220                working_line = re.sub(r'(^|\.\s+)([a-z])', lambda m: m.group(1) + m.group(2).upper(), working_line)
221
222            # Update line in content if modifications were made
223            if working_line != original_line:
224                content[idx] = working_line
225                total_fixes += 1
226
227        # Write corrected content back and update the JSON brain
228        file_path.write_text("\n".join(content), encoding="utf-8")
229        self._save_kb() 
230        return total_fixes

NLP-enhanced repair engine that learns and persists style corrections.

Attributes:
  • config (Dict): Tool configuration extracted from pyproject.toml.
  • kb_path (Path): Location of the persistent JSON knowledge base.
  • kb (Dict): The internal memory of the fixer (Branding + Learned terms).
  • nlp: The spaCy language model used for linguistic context checks.
StyleFixer(config_path: Optional[pathlib.Path] = None)
38    def __init__(self, config_path: Optional[Path] = None) -> None:
39        """
40        Initializes the fixer and loads the persistent Knowledge Base.
41        
42        Args:
43            config_path (Path, optional): Custom path to pyproject.toml.
44        """
45        self.config_path = config_path or Path("pyproject.toml")
46        self.config = self._load_config()
47        
48        # Resolve the persistent JSON Knowledge Base path (branding rules + learned spelling fixes).
49        kb_setting = self.config.get("pipeline", {}).get("knowledge_base", "data/knowledge_base.json")
50        self.kb_path = Path(kb_setting)
51        self.kb = self._load_kb()
52
53        try:
54            self.nlp = spacy.load("en_core_web_sm")
55        except Exception:
56            # Fallback when spaCy or its 'en_core_web_sm' model is unavailable; NLP-dependent tense shifting is skipped.
57            self.nlp = None

Initializes the fixer and loads the persistent Knowledge Base.

Arguments:
  • config_path (Path, optional): Custom path to pyproject.toml.
config_path
config
kb_path
kb
def fix_file(self, file_path: pathlib.Path, violations: List[Dict[str, Any]]) -> int:
131    def fix_file(self, file_path: Path, violations: List[Dict[str, Any]]) -> int:
132        """
133        The main repair loop. Iterates through line-specific violations and 
134        applies branding and style corrections.
135
136        Args:
137            file_path (Path): Path to the generated AsciiDoc file.
138            violations (List[Dict]): List of findings from the Linter.
139            
140        Returns:
141            int: Number of lines successfully modified.
142        """
143        if not file_path.exists(): 
144            return 0
145        content = file_path.read_text(encoding="utf-8").splitlines()
146        total_fixes = 0
147        
148        # Group issues by line number so each affected source line is visited once
149        line_map = defaultdict(list)
150        for v in violations: 
151            line_map[v.get("Line", 0)].append(v)
152
153        patterns = self.config.get("patterns", {})
154        extract_re = patterns.get("suggestion_extraction", r"'(.*?)'")
155        remove_trigger = patterns.get("removal_trigger", "removing")
156        instead_of_trigger = patterns.get("instead_of_trigger", "instead of")
157
158        # Merged branding context: learned spelling fixes plus automated branding rules
159        session_branding = {**self.kb.get("learned", {}), **self.kb.get("automated_fixes", {})}
160
161        # Walk lines bottom-up so edits never disturb the indices of lines still pending repair
162        for line_num in sorted(line_map.keys(), reverse=True):
163            idx = line_num - 1
164            if idx < 0 or idx >= len(content): 
165                continue
166            
167            working_line = content[idx]
168            original_line = working_line
169
170            # --- PHASE 1: LINTER-DRIVEN REPAIRS ---
171            for issue in line_map[line_num]:
172                msg = issue.get("Message", "")
173                check_id = issue.get("Check", "")
174                suggestion = issue.get("Suggestion", "")
175
176                # 1. Branding Sync (e.g., Use 'SUSE' instead of 'suse')
                # NOTE(review): msg.lower() can never contain an uppercase 'wrong' key —
                # confirm all KB keys are stored lowercase, otherwise this never matches.
177                for wrong, correct in session_branding.items():
178                    if f"'{wrong}'" in msg.lower() or f"‘{wrong}’" in msg.lower():
179                        working_line = re.sub(rf"\b{re.escape(wrong)}\b", correct, working_line, flags=re.IGNORECASE)

180
181                # 2. Surgical Removal (e.g., "Note that...", "Actually...")
182                if remove_trigger in msg.lower() or "Editorializing" in check_id:
183                    target = suggestion if suggestion else (re.findall(extract_re, msg)[0] if re.findall(extract_re, msg) else None)
184                    if target:
185                        working_line = re.sub(rf"\b{re.escape(target)}\b\s?", "", working_line, flags=re.IGNORECASE)
186
187                # 3. Phrasal Substitution (e.g., "Use 'X' instead of 'Y'")
188                elif instead_of_trigger in msg.lower():
189                    if suggestion:
190                        m = re.findall(extract_re, msg)
191                        wrong_term = m[1] if len(m) >= 2 else (m[0] if m else "")
192                        if wrong_term:
193                            # --- GUARDRAIL: Let repair.py handle complex tense shifts ---
194                            if "will" in wrong_term.lower() or "will" in msg.lower():
195                                continue
196                            working_line = re.sub(rf"\b{re.escape(wrong_term)}\b", suggestion, working_line, flags=re.IGNORECASE)
197
198                # 4. Auto-Learning: Capture spelling fixes into the Knowledge Base
199                elif "Spelling" in check_id:
200                    if suggestion and suggestion.lower() not in ["spelling", "spellings", "learned"]:
201                        match = re.findall(extract_re, msg)
202                        word_to_fix = match[0] if match else ""
203                        if word_to_fix:
204                            working_line = re.sub(rf"\b{re.escape(word_to_fix)}\b", suggestion, working_line)
205                            # Persist this correction for future automation
                            # NOTE(review): assumes self.kb always has a 'learned' key — _load_kb
                            # seeds it only in the fallback path; confirm for pre-existing KB files.
206                            if word_to_fix.lower() not in session_branding:
207                                self.kb["learned"][word_to_fix.lower()] = suggestion
208
209            # --- PHASE 2: GLOBAL BRANDING & FORMATTING GUARDRAILS ---
210            
211            # 1. Branding Guardrail: Apply core branding safely (no URL/Path corruption)
212            for wrong, correct in self.kb.get("automated_fixes", {}).items():
213                # Negative lookarounds (?<![\/-]) prevent breaking paths like /img/suse-logo.svg
214                pattern = rf"(?<![\/-])\b{re.escape(wrong)}\b(?![\/-])"
215                working_line = re.sub(pattern, correct, working_line, flags=re.IGNORECASE)
216
217            # 2. Fragment Healer: Ensure sentences start with capital letters
218            # Ignores lines starting with AsciiDoc technical syntax
219            if not re.match(r'^(image::|video::|xref:|link:|http|\[|:)', working_line, flags=re.IGNORECASE):
220                working_line = re.sub(r'(^|\.\s+)([a-z])', lambda m: m.group(1) + m.group(2).upper(), working_line)
221
222            # Update line in content if modifications were made
223            if working_line != original_line:
224                content[idx] = working_line
225                total_fixes += 1
226
227        # Write corrected content back and update the JSON brain
228        file_path.write_text("\n".join(content), encoding="utf-8")
229        self._save_kb() 
230        return total_fixes

The main repair loop. Iterates through line-specific violations and applies branding and style corrections.

Arguments:
  • file_path (Path): Path to the generated AsciiDoc file.
  • violations (List[Dict]): List of findings from the Linter.
Returns:

int: Number of lines successfully modified.

__version__ = '1.0.0'
def get_info() -> str:
    """Build and return the human-readable identity banner for this package.

    Returns:
        str: Tool name, semantic version, and a short purpose tagline.
    """
    identity = f"Transpiler-Pro v{__version__} - Enterprise Documentation Engine"
    return identity

Returns the basic identity string for the package.

Returns:

str: A formatted string containing the tool name, version, and purpose.