Source code for nbrefactor.processor.parser
""" Parsing methods for both code and markdown cells in a notebook
"""
import re
from copy import copy
from .cda import analyze_code_cell
from ..datastructs import MarkdownHeader, MarkdownCommand
from ..datastructs import ParsedCodeCell, ParsedMarkdownCell
def parse_code_cell(cell_idx, source, module_node):
    """
    Builds a ParsedCodeCell object out of a code cell's raw source.

    The source is analyzed through ``analyze_code_cell`` together with the
    full path of the given module node; the resulting ``'source'`` and
    ``'dependencies'`` entries are stored on the returned object.

    Args:
        cell_idx (int): the cell's index in the notebook.
        source (str): the cell's raw source code.
        module_node (:class:`~nbrefactor.datastructs.ModuleNode`): the module \
            node in the built module tree

    Returns:
        ParsedCodeCell: a parsed code cell object carrying the raw source, \
            the analyzed source, its dependencies and the owning module node.
    """
    # the analyzer is given the path of the module this cell belongs to
    module_path = module_node.get_full_path()
    analysis = analyze_code_cell(source, module_path)

    return ParsedCodeCell(
        cell_idx=cell_idx,
        raw_source=source,
        parsed_source=analysis['source'],
        dependencies=analysis['dependencies'],
        module_node=module_node,
    )
# Matches a ``$command`` or ``$command=value`` token. The command is a run of
# word characters, optionally hyphen-separated; the (lazy) value extends up
# to the first non-word character or the end of the line.
_CMD_REGEX = re.compile(r'\$\b(?P<command>\w+(?:-\w+)*)'
                        r'(?:=(?P<value>.*?))?(?=\s|$|[^\w])')

# Optional closing sequence of an ATX header: a run of '#' at the end of the
# line that is preceded by whitespace (e.g. the trailing '##' in "## a ##").
_HEADER_CLOSING_REGEX = re.compile(r'\s+#+$')


def parse_markdown_cell(cell_idx, source):
    """
    Parses a markdown cell into a ParsedMarkdownCell object.

    The cell is scanned line by line so that headers and commands are
    collected in the order they appear. A line containing a ``$command``
    token is handled as a command only; any other line starting with ``#``
    is handled as a header.

    Args:
        cell_idx (int): the cell's index in the notebook.
        source (str): the cell's markdown content.

    Returns:
        ParsedMarkdownCell: A parsed markdown cell object containing the \
            headers and commands (in order of appearance) and the list of \
            warnings collected for commands that failed to parse.
    """
    # NOTE: multiline HTML comments (potential command containers) are not
    # parsed as blocks; doing so while preserving the sequential order of
    # headers and commands is awkward with plain regex. Each line is
    # assessed individually instead. Candidate pattern kept for a possible
    # future implementation: r'<!--(?P<comment>(?:(.|\n)*?))-->'

    md_elements = []    # MarkdownCommand + MarkdownHeader objects, in the
                        # order they appear in (order matters when several
                        # headers/commands share one cell)
    warnings = []

    # `str.split` never mutates its input, so no defensive copy is needed
    for line in source.split('\n'):
        clean_line = line.strip()

        cmd_match = _CMD_REGEX.search(clean_line)
        if cmd_match:
            try:
                # raises a ValueError if the command is invalid or if an
                # expected value-type is not found
                cmd = MarkdownCommand(cmd_match.group('command'),
                                      cmd_match.group('value'))
            except ValueError as e:
                # assumes the first exception argument is a dict-like
                # warning record -- TODO confirm against MarkdownCommand
                err = e.args[0]
                err['cell_idx'] = cell_idx  # attach cell index for logging
                warnings.append(err)
            else:
                md_elements.append(cmd)
            continue

        # MD header -> extract node depth/level and name of package/module
        if clean_line.startswith('#'):
            header_body = clean_line.lstrip('#')
            # the level is the length of the *leading* run of '#' only;
            # counting every '#' on the line would inflate the level for
            # closed headers ("## a ##") or titles containing '#' ("# C#")
            md_level = len(clean_line) - len(header_body)
            # drop an optional closing sequence, keep '#' that is part of
            # the title itself
            md_header = _HEADER_CLOSING_REGEX.sub('', header_body).strip()
            md_elements.append(MarkdownHeader(md_header, md_level))

    return ParsedMarkdownCell(md_elements, warnings)