python-pid/archive/repolib/parsedeb.py
2023-02-23 22:05:14 +02:00

372 lines
12 KiB
Python

#!/usr/bin/python3
"""
Copyright (c) 2022, Ian Santopietro
All rights reserved.
This file is part of RepoLib.
RepoLib is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
RepoLib is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with RepoLib. If not, see <https://www.gnu.org/licenses/>.
"""
import logging
from . import util
log = logging.getLogger(__name__)
class DebParseError(util.RepoError):
    """Raised when a one-line deb source entry cannot be parsed."""

    def __init__(self, *args, code=1, **kwargs):
        """Build the exception and remember its error code.

        Arguments:
            *args: Positional arguments handed to the parent exception.
            code (:obj:`int`, optional, default=1): Exception error code.
            **kwargs: Keyword arguments handed to the parent exception.
        """
        super().__init__(*args, **kwargs)
        self.code = code
def debsplit(line:str) -> list:
    """Split a deb line on whitespace, keeping ``[...]`` groups in one piece.

    Adapted from python-apt.

    Arguments:
        line(str): The line to split up.

    Returns:
        `list`: The pieces of the line, in their original order.
    """
    # Every token that validates as a URL gets its encoded brackets restored;
    # re-joining with single spaces also collapses runs of whitespace.
    tokens = [
        decode_brackets(token) if util.url_validator(token) else token
        for token in line.strip().split()
    ]
    normalized = ' '.join(tokens)

    pieces: list = []
    current: list = []
    inside_brackets = False  # True while scanning between '[' and ']'

    for char in normalized:
        if char.isspace() and not inside_brackets:
            # Whitespace outside of brackets terminates the current piece.
            pieces.append(''.join(current))
            current = []
            continue

        if char == '[':
            inside_brackets = True
        elif char == ']':
            inside_brackets = False
        current.append(char)

    # Whatever is left after the last separator is the final piece.
    if current:
        pieces.append(''.join(current))

    return pieces
def encode_brackets(word:str) -> str:
    """ Percent-encode every [ and ] in the input.

    Technically we should never be receiving these, and other characters
    would strictly need encoding as well. Square brackets, however, actively
    break the URL parsing, so they are the ones that must be avoided.

    Arguments:
        word (str): the string to encode brackets in.

    Returns:
        `str`: the encoded string.
    """
    bracket_table = str.maketrans({'[': '%5B', ']': '%5D'})
    return word.translate(bracket_table)
def decode_brackets(word:str) -> str:
    """ Un-encodes [ and ] from the input

    Since our downstream libraries should also be encoding these correctly, it
    is better to retain these as the user entered, as that ensures they can
    recognize it properly.

    The hexadecimal digits of a percent-encoded octet are case-insensitive,
    so both the upper-case (``%5B``/``%5D``) and the lower-case
    (``%5b``/``%5d``) spellings are decoded.

    Arguments:
        word (str): The string to decode.

    Returns:
        `str`: the decoded string.
    """
    replacements = (
        ('%5B', '['),
        ('%5b', '['),
        ('%5D', ']'),
        ('%5d', ']'),
    )
    for encoded, bracket in replacements:
        word = word.replace(encoded, bracket)
    return word
def parse_name_ident(tail:str) -> tuple:
    """ Extract the Repolib name and ident from a comment string.

    The name is introduced by "X-Repolib-Name:" and may span several words.
    The ident is introduced by "X-Repolib-ID:" and is exactly one word.

    A field stops at the end of the string, at the header of the other
    field, or at a word containing '#'. The ident additionally stops after
    its first word. Everything else is collected as the remaining comment.

    Arguments:
        tail (str): The comment to search within.

    Returns: tuple(name, ident, comment):
        name (str): The detected name; falls back to the ident when no name
            was found, and is '' when neither was found.
        ident (str): The detected ident, or ''.
        comment (str): The string with the name and ident removed.

    Raises:
        DebParseError: A header was present but no value could be read for it.
    """
    tail = util.strip_hashes(tail)

    # Remember which headers are present so that a missing value can be
    # reported once the scan is finished.
    has_name = 'X-Repolib-Name' in tail
    log.debug('Line name found: %s', has_name)
    has_ident = 'X-Repolib-ID' in tail
    log.debug('Line ident found: %s', has_ident)

    name_words: list = []
    ident_words: list = []
    comment_words: list = []
    target = 'comment'  # which field the next plain word belongs to

    for token in tail.split():
        log.debug("Checking line item: %s", token)
        bare = token.strip('#').strip()

        # Header words only switch the target; they are never stored.
        if bare.startswith('X-Repolib-Name'):
            target = 'name'
            continue
        if bare.startswith('X-Repolib-ID'):
            target = 'ident'
            continue

        # A word carrying a '#' ends whichever field was being collected.
        if '#' in token:
            target = 'comment'

        if target == 'name':
            name_words.append(token)
        elif target == 'ident':
            # The ident is a single word, so fall back to comments afterwards.
            ident_words.append(token)
            target = 'comment'
        else:
            comment_words.append(token.strip('#'))

    name = ' '.join(name_words).strip()
    ident = ''.join(ident_words).strip()
    comment = ' '.join(comment_words).strip()

    if not name and ident:
        name = ident

    # Final sanity checking
    if has_name and not name:
        raise DebParseError(
            f'Could not parse repository name from comment {comment}. Make sure '
            'you have a space between the colon and the Name'
        )
    if has_ident and not ident:
        raise DebParseError(
            f'Could not parse repository ident from comment {comment}. Make sure '
            'you have a space between the colon and the Ident'
        )

    return name, ident, comment
class ParseDeb:
    """ Parsing for source entries.

    Contains parsing helpers for one-line format sources.
    """

    def __init__(self, debug:bool = False) -> None:
        """
        Arguments:
            debug (bool): In debug mode, the structured data is always returned
                at the end, instead of checking for sanity (default: `False`)
        """
        self.debug = debug
        # The previously parsed line and the line currently being parsed.
        self.last_line: str = ''
        self.last_line_valid: bool = False
        self.curr_line: str = ''
        self.curr_line_valid: bool = False

    def parse_options(self, opt:str) -> dict:
        """ Parses a string of options into a dictionary that repolib can use.

        Arguments:
            opt(str): The string with options returned from the line parser,
                including its enclosing brackets.

        Returns:
            `dict`: The dictionary of options with key:val pairs (may be {})

        Raises:
            DebParseError: An option is not of the form ``key=value`` or its
                key is not present in ``util.options_inmap``.
        """
        opt = opt.strip()
        opt = opt[1:-1].strip()  # Remove enclosing brackets

        parsed_options: dict = {}
        for option in opt.split():
            pre_key, separator, raw_values = option.partition('=')
            # Anything other than exactly one '=' is malformed; report it as
            # a parse error rather than leaking a ValueError from unpacking.
            if not separator or '=' in raw_values:
                raise DebParseError(
                    f'Could not parse line {self.curr_line}: option {option} '
                    'is not of the form key=value.'
                )

            try:
                key: str = util.options_inmap[pre_key]
            except KeyError as err:
                raise DebParseError(
                    f'Could not parse line {self.curr_line}: option {option} is '
                    'not a valid debian repository option or is unsupported.'
                ) from err

            # Comma-separated values are stored space-separated.
            parsed_options[key] = ' '.join(raw_values.split(','))

        return parsed_options

    def parse_line(self, line:str) -> dict:
        """ Parse a deb line into its individual parts.

        Adapted from python-apt

        Arguments:
            line (str): The line input to parse

        Returns:
            (dict): a dict containing the requisite data, with the keys
                ``enabled``, ``name``, ``ident``, ``comments``, ``repo_type``,
                ``uri``, ``suite``, ``components`` and ``options``.

        Raises:
            DebParseError: The line is empty, is not a deb/deb-src line, is a
                CDROM source, has an invalid URI or option, or is missing
                required pieces.
        """
        self.last_line = self.curr_line
        self.last_line_valid = self.curr_line_valid
        self.curr_line = line.strip()

        if self.curr_line in ('', '#'):
            raise DebParseError(f'Current line "{self.curr_line}" is empty')

        line_parsed: dict = {
            'enabled': True,
            'name': '',
            'ident': '',
            'comments': [],
            'repo_type': '',
            'uri': '',
            'suite': '',
            'components': [],
            'options': {},
        }

        # A leading '#' marks a disabled source; whatever follows the hashes
        # must still be a deb line, otherwise it is just an ordinary comment.
        if line.startswith('#'):
            line_parsed['enabled'] = False
            line = util.strip_hashes(line)
            parts = line.split()
            # Guard against nothing being left once the hashes are removed.
            if not parts or parts[0] not in ('deb', 'deb-src'):
                raise DebParseError(f'Current line "{self.curr_line}" is invalid')

        # Anything after a further '#' is a trailing comment which may carry
        # the repolib name and ident.
        comments_index = line.find('#')
        if comments_index > 0:
            raw_comments: str = line[comments_index + 1:].strip()
            (
                line_parsed['name'],
                line_parsed['ident'],
                comments
            ) = parse_name_ident(raw_comments)
            line_parsed['comments'].append(comments)
            line = line[:comments_index]

        parts = debsplit(line)
        if len(parts) < 3:  # We need at least a type, a URL, and a component
            raise DebParseError(
                f'The line "{self.curr_line}" does not have enough pieces to be '
                'valid'
            )

        # Determine the type of the repo
        repo_type: str = parts.pop(0)
        if repo_type not in ('deb', 'deb-src'):
            raise DebParseError(f'The line "{self.curr_line}" is of invalid type.')
        line_parsed['repo_type'] = util.SourceType(repo_type)

        # A bracketed piece means an option list precedes the URI, which moves
        # the URI (and everything after it) one position to the right.
        bracketed = [part for part in parts if part.startswith('[')]
        if any('cdrom' in part for part in bracketed):
            # This could maybe change if the parser now differentiates between
            # CDROM URIs and option lists
            raise DebParseError('Repolib cannot currently accept CDROM Sources')

        if bracketed:
            line_parsed['options'] = self.parse_options(parts.pop(0))

        if len(parts) < 2:  # Should have at minimum a URI and a suite/path
            raise DebParseError(
                f'The line "{self.curr_line}" does not have enough pieces to be '
                'valid'
            )

        line_uri = parts.pop(0)
        if not util.url_validator(line_uri):
            raise DebParseError(
                f'The line "{self.curr_line}" has invalid URI: {line_uri}'
            )
        line_parsed['uri'] = line_uri

        line_parsed['suite'] = parts.pop(0)
        # Everything left over after the suite is a component.
        line_parsed['components'].extend(parts)

        has_type = line_parsed['repo_type']
        has_uri = line_parsed['uri']
        has_suite = line_parsed['suite']

        # With these three minimum pieces the line is valid. In debug mode the
        # data is handed back regardless; otherwise error out.
        if (has_type and has_uri and has_suite) or self.debug:
            return line_parsed.copy()

        raise DebParseError(
            f'The line {self.curr_line} could not be parsed due to an '
            'unknown error (Probably missing the repo type, URI, or a '
            'suite/path).'
        )