python-pid/archive/repolib/parsedeb.py

#!/usr/bin/python3

"""
Copyright (c) 2022, Ian Santopietro
All rights reserved.

This file is part of RepoLib.

RepoLib is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RepoLib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with RepoLib.  If not, see <https://www.gnu.org/licenses/>.
"""

import logging

from . import util

log = logging.getLogger(__name__)

class DebParseError(util.RepoError):
    """ Raised when a one-line deb source entry cannot be parsed."""

    def __init__(self, *args, code=1, **kwargs):
        """ Set up the error and remember its error code.

        Arguments:
            *args: Positional arguments handed on to the parent exception.
            code (:obj:`int`, optional, default=1): Exception error code.
            **kwargs: Keyword arguments handed on to the parent exception.
        """
        super().__init__(*args, **kwargs)
        self.code = code

def debsplit(line:str) -> list:
    """ Improved string.split() with support for things like [] options.

    Adapted from python-apt

    The line is first split on whitespace; every word accepted by
    `util.url_validator` has its percent-encoded square brackets decoded.
    The words are then re-joined with single spaces and split again, this
    time keeping everything between a '[' and the following ']' together as
    one piece, even when it contains spaces.

    Arguments:
        line(str): The line to split up.

    Returns:
        `list`: The pieces of the line, in order (empty for a blank line).
    """
    # str.split() with no arguments already ignores leading/trailing
    # whitespace and collapses internal runs, so no separate strip is needed.
    # Building a new list avoids looking each word up again with list.index()
    # while the list is being rewritten.
    words:list = [
        decode_brackets(word) if util.url_validator(word) else word
        for word in line.split()
    ]

    pieces:list = []
    current:list = []
    # True while we are inside a [..] block
    in_brackets = False
    for char in ' '.join(words):
        if char == '[':
            in_brackets = True
        elif char == ']':
            in_brackets = False
        elif char.isspace() and not in_brackets:
            # A space outside of brackets terminates the current piece
            pieces.append(''.join(current))
            current = []
            continue
        current.append(char)

    # append last piece
    if current:
        pieces.append(''.join(current))
    return pieces

def encode_brackets(word:str) -> str:
    """ Percent-encode the square brackets found in a string.

    We should technically never be receiving these, and other characters
    would strictly need encoding too. Square brackets, however, actively
    break the URL parsing, so they are the ones that must be avoided.

    Arguments:
        word (str): the string to encode brackets in.

    Returns:
        `str`: the string with '[' replaced by '%5B' and ']' by '%5D'.
    """
    bracket_table = str.maketrans({'[': '%5B', ']': '%5D'})
    return word.translate(bracket_table)

def decode_brackets(word:str) -> str:
    """ Turn percent-encoded square brackets back into literal ones.

    Downstream libraries are expected to encode these correctly on their own,
    so it is better to keep the text as the user entered it; that way they
    can still recognize it.

    Only the exact upper-case sequences '%5B' and '%5D' are replaced.

    Arguments:
        word (str): The string to decode.

    Returns:
        `str`: the decoded string.
    """
    for encoded, bracket in (('%5B', '['), ('%5D', ']')):
        word = word.replace(encoded, bracket)
    return word

def parse_name_ident(tail:str) -> tuple:
    """ Extract the Repolib name and ident from a comment string.

    The name is introduced by a word starting with "X-Repolib-Name" and may
    span several words. The ident is introduced by a word starting with
    "X-Repolib-ID" and is exactly the single word that follows it.

    A field stops at the end of the input, at the header of the other field,
    or at any other word containing a '#'. Words that belong to neither field
    are collected (with surrounding '#' removed) as the remaining comment.

    If no name is found but an ident is, the ident is used as the name.

    Arguments:
        tail (str): The comment to search within.

    Returns: tuple(name, ident, comment):
        name (str): The detected name, or an empty string
        ident (str): The detected ident, or an empty string
        comment (str): The string with the name and ident removed

    Raises:
        DebParseError: A field header is present in the comment but no value
            could be extracted for it.
    """
    tail = util.strip_hashes(tail)

    # Remember which headers are present so we can sanity-check the result
    has_name = 'X-Repolib-Name' in tail
    log.debug('Line name found: %s', has_name)
    has_ident = 'X-Repolib-ID' in tail
    log.debug('Line ident found: %s', has_ident)

    # Which field the next plain word belongs to: 'name', 'ident' or None
    # (None means the word is part of the ordinary comment).
    collecting = None
    name_words:list = []
    comment_words:list = []
    ident:str = ''

    for word in tail.split():
        log.debug("Checking line item: %s", word)
        bare = word.strip('#').strip()

        # Field headers only switch the collection mode; they are not stored
        if bare.startswith('X-Repolib-Name'):
            collecting = 'name'
            continue
        if bare.startswith('X-Repolib-ID'):
            collecting = 'ident'
            continue

        # Any other word carrying a '#' ends the field being collected
        if '#' in word:
            collecting = None

        if collecting == 'name':
            name_words.append(word)
        elif collecting == 'ident':
            # The ident is a single word, so stop collecting right away
            ident += f'{word}'
            collecting = None
        else:
            comment_words.append(word.strip('#'))

    name:str = ' '.join(name_words).strip()
    ident = ident.strip()
    comment:str = ' '.join(comment_words).strip()

    # Fall back to the ident when no explicit name was given
    if not name and ident:
        name = ident

    # Final sanity checking
    if has_name and not name:
        raise DebParseError(
            f'Could not parse repository name from comment {comment}. Make sure '
            'you have a space between the colon and the Name'
        )
    if has_ident and not ident:
        raise DebParseError(
            f'Could not parse repository ident from comment {comment}. Make sure '
            'you have a space between the colon and the Ident'
        )

    return name, ident, comment


class ParseDeb:
    """ Parsing for source entries.

    Contains parsing helpers for one-line format sources.
    """

    def __init__(self, debug:bool = False) -> None:
        """
        Arguments:
            debug (bool): In debug mode, the structured data is always returned
                at the end, instead of checking for sanity (default: `False`)
        """
        self.debug = debug
        # The previously parsed line and the line currently being parsed.
        # parse_line() rotates curr_* into last_* on every call.
        self.last_line: str = ''
        self.last_line_valid: bool = False
        self.curr_line: str = ''
        self.curr_line_valid: bool = False

    def parse_options(self, opt:str) -> dict:
        """ Parses a string of options into a dictionary that repolib can use.

        The input is expected to be the bracketed option block of a deb line,
        e.g. `[key=val1,val2 other=val]`. Comma-separated values are stored
        as a single space-separated string.

        Arguments:
            opt(str): The string with options returned from the line parser.

        Returns:
            `dict`: The dictionary of options with key:val pairs (may be {})

        Raises:
            DebParseError: An option has no '=' separator, or its key is not
                present in `util.options_inmap`.
        """
        # Drop surrounding whitespace, then the enclosing brackets
        opt = opt.strip()[1:-1].strip()

        parsed_options:dict = {}

        for option in opt.split():
            # partition() keeps any further '=' inside the value and never
            # raises, so malformed options surface as DebParseError instead
            # of an unpacking ValueError.
            pre_key, separator, raw_value = option.partition('=')
            if not separator:
                raise DebParseError(
                    f'Could not parse line {self.curr_line}: option {option} '
                    'is missing a value (expected key=value).'
                )
            value:str = ' '.join(raw_value.split(','))
            try:
                key:str = util.options_inmap[pre_key]
            except KeyError as err:
                raise DebParseError(
                    f'Could not parse line {self.curr_line}: option {option} is '
                    'not a valid debian repository option or is unsupported.'
                ) from err
            parsed_options[key] = value

        return parsed_options


    def parse_line(self, line:str) -> dict:
        """ Parse a deb line into its individual parts.

        Adapted from python-apt

        Arguments:
            line (str): The line input to parse

        Returns:
            (dict): a dict containing the requisite data, with the keys
                `enabled`, `name`, `ident`, `comments`, `repo_type`, `uri`,
                `suite`, `components` and `options`.

        Raises:
            DebParseError: The line is empty, is a plain comment, is a CDROM
                source, or is otherwise not a valid one-line source entry.
        """
        self.last_line = self.curr_line
        self.last_line_valid = self.curr_line_valid
        self.curr_line = line.strip()

        if self.curr_line in ('', '#'):
            raise DebParseError(f'Current line "{self.curr_line}" is empty')

        line_parsed: dict = {
            'enabled': True,
            'name': '',
            'ident': '',
            'comments': [],
            'repo_type': '',
            'uri': '',
            'suite': '',
            'components': [],
            'options': {},
        }

        # Work on the stripped line so leading whitespace cannot hide the '#'
        # that marks a disabled source.
        line = self.curr_line
        if line.startswith('#'):
            line_parsed['enabled'] = False
            line = util.strip_hashes(line)
            first_words:list = line.split()
            # A commented-out line is only a disabled source if what follows
            # the hashes starts with a repo type; anything else (including
            # nothing at all) is an ordinary comment.
            if not first_words or first_words[0] not in ('deb', 'deb-src'):
                raise DebParseError(f'Current line "{self.curr_line}" is invalid')

        # Everything after the first remaining '#' is a trailing comment,
        # which may carry the X-Repolib name/ident fields.
        comments_index = line.find('#')
        if comments_index > 0:
            raw_comments:str = line[comments_index + 1:].strip()
            (
                line_parsed['name'],
                line_parsed['ident'],
                comments
            ) = parse_name_ident(raw_comments)
            line_parsed['comments'].append(comments)
            line = line[:comments_index]

        parts:list = debsplit(line)
        if len(parts) < 3: # We need at least a type, a URI, and a suite/path
            raise DebParseError(
                f'The line "{self.curr_line}" does not have enough pieces to be '
                'valid'
            )

        # Determine the type of the repo
        repo_type:str = parts.pop(0)
        if repo_type not in ('deb', 'deb-src'):
            raise DebParseError(f'The line "{self.curr_line}" is of invalid type.')
        line_parsed['repo_type'] = util.SourceType(repo_type)

        ## A piece starting with '[' is either an option block or, if it
        ## mentions "cdrom", part of a CDROM source. The presence of an
        ## option block shifts the position of the URI by one.
        bracketed:list = [part for part in parts if part.startswith('[')]

        if any('cdrom' in part for part in bracketed):
            # This could maybe change if the parser now differentiates between
            # CDROM URIs and option lists
            raise DebParseError('Repolib cannot currently accept CDROM Sources')

        if bracketed:
            line_parsed['options'] = self.parse_options(parts.pop(0))

        # Should still have at minimum a URI and a suite/path left; without
        # this check the pops below would fail with an IndexError.
        if len(parts) < 2:
            raise DebParseError(
                f'The line "{self.curr_line}" does not have enough pieces to be '
                'valid'
            )

        line_uri:str = parts.pop(0)
        if not util.url_validator(line_uri):
            raise DebParseError(
                f'The line "{self.curr_line}" has invalid URI: {line_uri}'
            )
        line_parsed['uri'] = line_uri

        line_parsed['suite'] = parts.pop(0)

        # Whatever is left after the suite are the components
        line_parsed['components'].extend(parts)

        has_type = line_parsed['repo_type']
        has_uri = line_parsed['uri']
        has_suite = line_parsed['suite']

        if has_type and has_uri and has_suite:
            # if we have these three minimum components, we can proceed and the
            # line is valid. Otherwise, error out.
            return line_parsed.copy()

        if self.debug:
            return line_parsed.copy()

        raise DebParseError(
            f'The line {self.curr_line} could not be parsed due to an '
            'unknown error (Probably missing the repo type, URI, or a '
            'suite/path).'
        )