rspec/rspec-tools/rspec_tools/validation/sanitize_asciidoc.py

""" Ensure the asciidoc code for a rule description follows best practices

Checks are:
* "ifdef"/"endif" blocks should be well-formed for RSPEC
* Inline code with backquotes is correctly escaped and balanced
* Include commands are not appended to other code
"""
from pathlib import Path
import re


# The only ifdef/endif commands accepted by the sanitizer.
VALID_IFDEF = "ifdef::env-github,rspecator-view[]"
VALID_ENDIF = "endif::env-github,rspecator-view[]"

# A line starting with ":name: " (variable declaration); such lines are not checked.
VARIABLE_DECL = re.compile(r':\w+: ')

# A line starting with an include command.
INCLUDE = re.compile(r'include::')

# Regex fragments for the characters that trigger inline formatting.
FORMATTING_CHARS = ['_', r'\*', r'\#']
WORD_FORMATTING_CHARS = [r'\~', r'\^']

# A doubled formatting char is interpreted wherever it appears
UNCONSTRAINED_FORMATTING = '|'.join(char * 2 for char in FORMATTING_CHARS)
# A single formatting char is dangerous at the beginning of a word...
FORMATTING_OPENING = '|'.join(rf'(\W|^){char}\w' for char in FORMATTING_CHARS)
# ...and at the end of a word
FORMATTING_CLOSING = '|'.join(rf'\w{char}(\W|$)' for char in FORMATTING_CHARS)
# Word formatting is broken by spaces so we look for things like `~word~` or `^word^`
WORD_FORMATTING = '|'.join(rf'{char}\S+{char}' for char in WORD_FORMATTING_CHARS)

# All the matchers above combined into a single capturing alternation
NEED_PROTECTION = re.compile(
    '('
    + '|'.join((
        UNCONSTRAINED_FORMATTING,
        FORMATTING_OPENING,
        FORMATTING_CLOSING,
        WORD_FORMATTING,
    ))
    + ')'
)

# Regex trick:
# The search for the closing '+' must stop at the first backquote.
# The pattern therefore matches EITHER a backquote OR the closing passthrough,
# and only the closing passthrough fills group 1.
# Users of this pattern ignore any match whose group 1 is empty.
CLOSE_CONSTRAINED_PASSTHROUGH = re.compile(r'`|((?<!\s)\+(?=`))')

# A single backquote that is not followed by a word character
CLOSE_CONSTRAINED_BACKQUOTE = re.compile(r'`(?!\w)')
# Two consecutive backquotes
CLOSE_UNCONSTRAINED_BACKQUOTE = re.compile('``')

# A passthrough macro such as pass:[...]; an escaped "\]" does not end it
PASSTHROUGH_MACRO_TEXT = r'pass:\w*\[(\\\]|[^\]])*\]'

PASSTHROUGH_MACRO = re.compile(PASSTHROUGH_MACRO_TEXT)

# Regex trick:
# Passthrough macros must be skipped, to not find pass:[``whatever``]
# The pattern therefore matches
# * EITHER a passthrough macro, including the backquotes it contains
# * OR opening backquotes, captured in the "backquote" group
# Users of this pattern ignore any match whose "backquote" group is empty.
BACKQUOTE = re.compile('|'.join((
    PASSTHROUGH_MACRO_TEXT,
    r'(?P<backquote>(``+)|(?<![\\\w])(`)(?!\s))',
)))

def close_passthrough(count, pos, line):
    """Find the end of a passthrough block marked by *count* plus signs.

    *pos* is the index in *line* of the first opening plus sign.

    Returns the index just after the closing plus signs, or *pos* unchanged
    when the plus signs at *pos* do not open a passthrough.
    """
    while count > 0:
        # `+++a++` will display '+a' in case of imbalance, so we look for the
        # biggest closing block first and retry with one plus sign fewer.
        if count == 1:
            content_start = pos + 1
            # Constrained '+'. It is a passthrough only if it is directly
            # around text and surrounded by backquotes: `+Some Content+`
            # The bounds are checked explicitly: a '+' ending the line has no
            # content after it (indexing would raise IndexError) and a '+' at
            # index 0 has nothing before it (line[-1] would wrap around).
            is_constrained = (
                pos > 0
                and content_start < len(line)
                and line[pos - 1] == '`'
                and not line[content_start].isspace()
            )
            if not is_constrained:
                return pos
            close_pattern = CLOSE_CONSTRAINED_PASSTHROUGH
        else:
            close_pattern = re.compile('(' + r'\+' * count + ')')
        end = close_pattern.search(line, pos + count)
        # Group 1 is empty when the constrained pattern stopped on a backquote
        if end and end.group(1):
            return end.end()
        count -= 1
    return pos


def skip_passthrough_macro(line, pos):
    '''Return the index just after the passthrough macro starting at *pos*.

    When no passthrough macro starts at *pos*, *pos* is returned unchanged.
    '''
    # Cheap first-character test before running the regex
    if line[pos] != 'p':
        return pos
    macro = PASSTHROUGH_MACRO.match(line, pos)
    return macro.end() if macro else pos


def skip_passthrough_plus(line, pos):
    '''If this is a passthrough +, skip to the end

    Returns the index just after the passthrough starting at *pos*, or *pos*
    unchanged when there is no passthrough there. *pos* may be equal to
    len(line): nothing can start at the end of the line, so it is returned
    as is instead of raising IndexError.
    '''
    if pos >= len(line) or line[pos] != '+':
        return pos
    # Count the consecutive plus signs opening the block
    count = 1
    while pos + count < len(line) and line[pos + count] == '+':
        count += 1
    return close_passthrough(count, pos, line)


def close_inline_block(line: str, pos: int, closing_pattern: re.Pattern[str]):
    """Find the end of an inline block whose content starts at *pos*.

    Passthrough macros and passthrough '+' blocks are skipped: their text is
    neither searched for *closing_pattern* nor added to the returned content.

    Returns a tuple (index, content): *index* is the position where
    *closing_pattern* matches, or -1 when the line ends before the block is
    closed; *content* is the text scanned up to that point.
    """
    scanned = []
    line_end = len(line)
    while pos < line_end:
        after_skip = skip_passthrough_macro(line, pos)
        if after_skip == pos:
            after_skip = skip_passthrough_plus(line, pos)
        if after_skip != pos:
            # Something was skipped. Re-evaluate from the new position: it may
            # be the end of the line (indexing it would raise IndexError) or
            # the start of another passthrough that must be skipped as well.
            pos = after_skip
            continue
        if closing_pattern.match(line, pos):
            return pos, ''.join(scanned)
        scanned.append(line[pos])
        pos += 1
    return -1, ''.join(scanned)


class Sanitizer:
    """Check one asciidoc file line by line and print every problem found.

    The checks cover ifdef/endif commands, empty lines around include
    commands and inline code written with backquotes. Lines inside a block
    delimited by '----' lines are not checked.
    """

    def __init__(self, file: Path):
        # NOTE: assertions are stripped when Python runs with -O. They are
        # kept as is so that the exception type seen by callers is unchanged.
        assert file.exists()
        assert file.is_file()

        self._file = file
        # True between an ifdef command and its endif
        self._is_env_open = False
        # True once an ifdef command has been seen in the file
        self._has_env = False
        self._error_count = 0
        # True between two '----' delimiter lines
        self._is_inside_code = False
        # True when the most recent blank or description line was blank
        # (also True at the start of the file)
        self._empty_line = True
        # True while an include is waiting for the empty line that must follow it
        self._previous_line_was_include = False

    def process(self) -> int:
        """Check the whole file and return the number of errors reported."""
        content = self._file.read_text(encoding="utf-8")
        lines = content.splitlines(keepends=False)
        for line_number, line in enumerate(lines, start=1):
            if line == '----':
                # The same delimiter opens and closes a code block
                self._is_inside_code = not self._is_inside_code
                continue
            if self._is_inside_code:
                continue
            if line.startswith("ifdef::"):
                self._process_open_ifdef(line_number, line)
            elif line.startswith("endif::"):
                self._process_close_ifdef(line_number, line)
            elif not line.strip():
                self._empty_line = True
                # The include, if any, is now followed by an empty line.
                # Forget it so that a later line glued to a variable
                # declaration is not reported as glued to the include.
                self._previous_line_was_include = False
            else:
                self._process_description(line_number, line)
                self._empty_line = False

        if self._is_env_open:
            self._on_error(len(lines), "An ifdef command is opened but never closed.")

        return self._error_count

    def _process_open_ifdef(self, line_number: int, line: str) -> None:
        """Check an ifdef command: only one per file, with the expected content."""
        if self._has_env:
            message = "Only one ifdef command is allowed per file."
            if self._is_env_open:
                message += "\nThe previous ifdef command was not closed."
            self._on_error(line_number, message)

        self._has_env = True
        self._is_env_open = True

        # IDEs should be configured to properly display the description,
        # not the other way around.
        # "env-vscode" was used in the past. Instead, users should be able to
        # toggle the rspecator view based on their needs. Help these users migrate.
        if "vscode" in line:
            self._on_error(
                line_number,
                "Configure VS Code to display rspecator-view by setting the asciidoctor attribute.",
            )
        elif line != VALID_IFDEF:
            self._on_error(
                line_number,
                f'Incorrect asciidoc environment. "{VALID_IFDEF}" should be used instead.',
            )

    def _process_close_ifdef(self, line_number: int, line: str) -> None:
        """Check an endif command: it must close an ifdef and have the expected content."""
        if not self._is_env_open:
            self._on_error(line_number, "Unexpected endif command.")

        self._is_env_open = False

        if line != VALID_ENDIF:
            self._on_error(
                line_number,
                f'Incorrect endif command. "{VALID_ENDIF}" should be used instead.',
            )

    def _process_description(self, line_number: int, line: str) -> None:
        """Check a non-empty description line: includes and inline code."""
        if VARIABLE_DECL.match(line):
            return
        if self._previous_line_was_include and not self._empty_line:
            self._on_error(line_number - 1, '''An empty line is missing after the include.
This may result in broken tags and other display issues.
Make sure there are always empty lines before and after each include''')
        if INCLUDE.match(line):
            self._previous_line_was_include = True
            if not self._empty_line:
                self._on_error(line_number, '''An empty line is missing before the include.
This may result in broken tags and other display issues.
Make sure there are always empty lines before and after each include''')
            return
        self._previous_line_was_include = False

        # BACKQUOTE also matches passthrough macros, without filling the
        # "backquote" group: the scan stops at the first such match.
        res = BACKQUOTE.search(line, 0)
        while res and res.group('backquote'):
            pos = self._check_inlined_code(line_number, res.end(), line, res.group('backquote'))
            res = BACKQUOTE.search(line, pos)

    def _check_inlined_code(self, line_number: int, pos: int, line: str, opening_pattern: str) -> int:
        """Check the inline code opened by *opening_pattern* and ending before or at the end of *line*.

        *pos* is the index just after the opening backquotes.
        Returns the index from which the search for the next inline code
        should resume.
        """
        opening_length = len(opening_pattern)
        if opening_length > 2:
            # Part of the backquotes are displayed as backquotes.
            self._on_error(line_number, 'Use "++" to isolate the backquotes you want to display from the ones that should be interpreted by AsciiDoc.')
            return pos

        if opening_length == 2:
            closing_pattern = CLOSE_UNCONSTRAINED_BACKQUOTE
        else:
            closing_pattern = CLOSE_CONSTRAINED_BACKQUOTE

        content_end, content = close_inline_block(line, pos, closing_pattern)
        if content_end < 0:
            message = 'Unbalanced code inlining tags.'
            if opening_length == 1:
                message += '''
If you are trying to write inline code that is glued to text without a space,
you need to use double backquotes:
> Replace all `reference`s.
Will not display correctly. You need to write:
> Replace all ``reference``s.
'''
            self._on_error(line_number, message)
            return len(line)

        if NEED_PROTECTION.search(content):
            self._on_error(line_number, f'''
Using backquotes does not protect against asciidoc interpretation. Starting or
ending a word with '*', '#', '_' or having two of them consecutively will
trigger unintended behavior with the rest of the text.
Use ``++{content}++`` to avoid that.
If you really want to have formatting inside your code, you can write
``pass:n[{content}]``
''')
        # Resume after the closing backquotes, which are as long as the opening ones
        return content_end + opening_length

    def _on_error(self, line_number: int, message: str) -> None:
        """Print *message* prefixed with the file and line, and count the error."""
        print(f"{self._file}:{line_number} {message}")
        self._error_count += 1


def sanitize_asciidoc(file_path: Path):
    """Entry point called by the CLI: check *file_path* and return the error count."""
    sanitizer = Sanitizer(file_path)
    return sanitizer.process()
Fix broken or dangerous backquotes Co-authored-by: Marco Borgeaud <89914223+marco-antognini-sonarsource@users.noreply.github.com> 2023-10-30 10:33:56 +01:00			`""" Ensure the asciidoc code for a rule description follows best practices`

			`Checks are:`
			`* "ifdef"/"endif" blocks should be well-formed for RSPEC`
			`* Inline code with backquotes is correctly escaped and balanced`
			`* Include commands are not appended to other code`
			`"""`
			`from pathlib import Path`
			`import re`


			`VALID_IFDEF = "ifdef::env-github,rspecator-view[]"`
			`VALID_ENDIF = "endif::env-github,rspecator-view[]"`

			`VARIABLE_DECL = re.compile(r':\w+: ')`

			`INCLUDE = re.compile(r'include::')`

			`FORMATTING_CHARS = ['_', r'\*', r'\#']`
			`WORD_FORMATTING_CHARS = [r'\~', r'\^']`

			`# If the formatting char is repeated twice, it can go anywhere`
			`UNCONSTRAINED_FORMATTING = '\|'.join(x + x for x in FORMATTING_CHARS)`
			`# Single formatting char are dangerous at the beginning of a word`
			`FORMATTING_OPENING = '\|'.join(r'(\W\|^)' + x + r'\w' for x in FORMATTING_CHARS)`
			`# Single formatting char are dangerous at the end of a word`
			`FORMATTING_CLOSING = '\|'.join(r'\w' + x + r'(\W\|$)' for x in FORMATTING_CHARS)`
			# Word formatting is broken by spaces so we look for things like `#word#`
			`WORD_FORMATTING = "\|".join(x + r'\S+' + x for x in WORD_FORMATTING_CHARS)`

			`# We combine all the matchers`
			`NEED_PROTECTION = re.compile('('`
			`f'{UNCONSTRAINED_FORMATTING}\|'`
			`f'{FORMATTING_OPENING}\|'`
			`f'{FORMATTING_CLOSING}\|'`
			`f'{WORD_FORMATTING}'`
			`')')`

			`# There is a regex trick here:`
			`# We want to stop the search if there is a backquote`
			`# We do that by matching backquote OR the closing passthrough`
			`# Then we'll ignore any match of backquote`
			CLOSE_CONSTRAINED_PASSTHROUGH = re.compile(r'`\|((?<!\s)\+(?=`))')

			CLOSE_CONSTRAINED_BACKQUOTE = re.compile(r'`(?!\w)')
			CLOSE_UNCONSTRAINED_BACKQUOTE = re.compile('``')

			`PASSTHROUGH_MACRO_TEXT = r'pass:\w\[(\\\]\|[^\]])\]'`

			`PASSTHROUGH_MACRO = re.compile(PASSTHROUGH_MACRO_TEXT)`

			`# There is a regex trick here:`
			# We want to skip passthrough macros, to not find pass:[``whatever``]
			`# We do that by matching`
			`# * EITHER passthrough macros including their ignored backquotes`
			`# * OR backquotes`
			`# Then we'll ignore any match of PASSTHROUGH_MACRO`
			BACKQUOTE = re.compile(PASSTHROUGH_MACRO_TEXT + r'\|(?P<backquote>(``+)\|(?<![\\\w])(`)(?!\s))')

			`def close_passthrough(count, pos, line):`
			`"""Find the end of a passthrough block marked by count plus signs"""`
			`while count > 0:`
			# `+++a++` will display '+a' in case of inbalance, we try to find the biggest closing block
			`if count == 1:`
			if not line[pos + count].isspace() and line[pos - 1] == '`':
			#constrained '+'. It is a passthrough only if it is directly around text and surrounded by backquotes: `+Some Content+`
			`close_pattern = CLOSE_CONSTRAINED_PASSTHROUGH`
			`else:`
			`return pos`
			`else:`
			`close_pattern = re.compile('(' + r'\+' * count + ')')`
			`end = close_pattern.search(line, pos + count)`
			`if end and end.group(1):`
			`return end.end()`
			`count -= 1`
			`return pos`


			`def skip_passthrough_macro(line, pos):`
			`'''If this is a passthrough macro, skip to the end'''`
			`if line[pos] == 'p':`
			`pm = PASSTHROUGH_MACRO.match(line, pos)`
			`if pm:`
			`return pm.end()`
			`return pos`


			`def skip_passthrough_plus(line, pos):`
			`'''If this is a passthrough +, skip to the end'''`
			`if line[pos] == '+':`
			`count = 1`
			`while pos + count < len(line) and line[pos + count] == '+':`
			`count += 1`
			`return close_passthrough(count, pos, line)`
			`return pos`


			`def close_inline_block(line: str, pos: int, closing_pattern: re.Pattern[str]):`
			`"""Find the end of an inline block started with pattern"""`
			`content = ""`
			`while pos < len(line):`
			`pos = skip_passthrough_macro(line, pos)`
			`pos = skip_passthrough_plus(line, pos)`
			`if closing_pattern.match(line, pos):`
			`return pos, content`
			`content += line[pos]`
			`pos += 1`
			`return -1, content`


			`class Sanitizer:`
			`def __init__(self, file: Path):`
			`assert file.exists()`
			`assert file.is_file()`

			`self._file = file`
			`self._is_env_open = False`
			`self._has_env = False`
			`self._error_count = 0`
			`self._is_inside_code = False`
			`self._empty_line = True`
			`self._previous_line_was_include = False`

			`def process(self) -> bool:`
			`content = self._file.read_text(encoding="utf-8")`
			`lines = content.splitlines(keepends=False)`
			`for line_index, line in enumerate(lines):`
			`if self._is_inside_code:`
			`if line == '----':`
			`self._is_inside_code = False`
			`continue`
			`if line == '----':`
			`self._is_inside_code = True`
			`continue`
			`line_number = line_index + 1`
			`if line.startswith("ifdef::"):`
			`self._process_open_ifdef(line_number, line)`
			`elif line.startswith("endif::"):`
			`self._process_close_ifdef(line_number, line)`
			`elif not line.strip():`
			`self._empty_line = True`
			`else:`
			`self._process_description(line_number, line)`
			`self._empty_line = False`

			`if self._is_env_open:`
			`self._on_error(len(lines), "An ifdef command is opened but never closed.")`

			`return self._error_count`

			`def _process_open_ifdef(self, line_number: int, line: str):`
			`if self._has_env:`
			`message = "Only one ifdef command is allowed per file."`
			`if self._is_env_open:`
			`message += "\nThe previous ifdef command was not closed."`
			`self._on_error(line_number, message)`

			`self._has_env = True`
			`self._is_env_open = True`

			`# IDEs should be configured to properly display the description,`
			`# not the other way around.`
			`# "env-vscode" was used in the passed. Instead, user should be able to`
			`# toggle the rspecator view based on their needs. Help these users migrate.`
			`if "vscode" in line:`
			`self._on_error(`
			`line_number,`
			`"Configure VS Code to display rspecator-view by setting the asciidoctor attribute.",`
			`)`
			`elif line != VALID_IFDEF:`
			`self._on_error(`
			`line_number,`
			`f'Incorrect asciidoc environment. "{VALID_IFDEF}" should be used instead.',`
			`)`

			`def _process_close_ifdef(self, line_number: int, line: str):`
			`if not self._is_env_open:`
			`self._on_error(line_number, "Unexpected endif command.")`

			`self._is_env_open = False`

			`if line != VALID_ENDIF:`
			`self._on_error(`
			`line_number,`
			`f'Incorrect endif command. "{VALID_ENDIF}" should be used instead.',`
			`)`

			`def _process_description(self, line_number: int, line: str):`
			`if VARIABLE_DECL.match(line):`
			`return`
			`if self._previous_line_was_include and not self._empty_line:`
			`self._on_error(line_number - 1, '''An empty line is missing after the include.`
			`This may result in broken tags and other display issues.`
			`Make sure there are always empty lines before and after each include''')`
			`if INCLUDE.match(line):`
			`self._previous_line_was_include = True`
			`if not self._empty_line:`
			`self._on_error(line_number, '''An empty line is missing before the include.`
			`This may result in broken tags and other display issues.`
			`Make sure there are always empty lines before and after each include''')`
			`return`
			`else:`
			`self._previous_line_was_include = False`
			`pos = 0`
			`res = BACKQUOTE.search(line, pos)`
			`# We filter out matches for passthrough. See comment near the BACKQUOTE declaration`
			`while res and res.group('backquote'):`
			`pos = self._check_inlined_code(line_number, res.end(), line, res.group('backquote'))`
			`res = BACKQUOTE.search(line, pos)`

			`def _check_inlined_code(self, line_number: int, pos: int, line: str, opening_pattern: str):`
			`if len(opening_pattern) > 2:`
			`# Part of the backquotes are displayed as backquotes.`
			`self._on_error(line_number, 'Use "++" to isolate the backquotes you want to display from the ones that should be interpreted by AsciiDoc.')`
			`return pos`
			`elif len(opening_pattern) == 2:`
			`closing_pattern = CLOSE_UNCONSTRAINED_BACKQUOTE`
			`else:`
			`closing_pattern = CLOSE_CONSTRAINED_BACKQUOTE`

			`content_end, content = close_inline_block(line, pos, closing_pattern)`
			`if content_end < 0:`
			`message='Unbalanced code inlining tags.'`
			`if len(opening_pattern) == 1:`
			`message += '''`
			`If you are trying to write inline code that is glued to text without a space,`
			`you need to use double backquotes:`
			> Replace all `reference`s.
			`Will not display correctly. You need to write:`
			> Replace all ``reference``s.
			`'''`
			`self._on_error(line_number, message)`
			`return len(line)`
			`pos = content_end + len(opening_pattern)`
			`if NEED_PROTECTION.search(content):`
			`self._on_error (line_number, f'''`
			`Using backquotes does not protect against asciidoc interpretation. Starting or`
			`ending a word with '*', '#', '_' or having two of them consecutively will`
			`trigger unintended behavior with the rest of the text.`
			Use ``++{content}++`` to avoid that.
			`If you really want to have formatting inside your code, you can write`
			``pass:n[{content}]``
			`''')`
			`return pos`
			`return pos`

			`def _on_error(self, line_number: int, message: str):`
			`print(f"{self._file}:{line_number} {message}")`
			`self._error_count += 1`


			`def sanitize_asciidoc(file_path: Path):`
			`"""Called by the CLI"""`
			`return Sanitizer(file_path).process()`