# Source code for nbrefactor.processor.parser

""" Parsing methods for both code and markdown cells in a notebook
"""

import re
from copy import copy
from .cda import analyze_code_cell
from ..datastructs import MarkdownHeader, MarkdownCommand
from ..datastructs import ParsedCodeCell, ParsedMarkdownCell


def parse_code_cell(cell_idx, source, module_node):
    """
    Builds a ParsedCodeCell from the raw source of a notebook code cell.

    The source is passed to ``analyze_code_cell`` together with the full
    path of the module node the cell belongs to; the ``'source'`` and
    ``'dependencies'`` entries of the analysis result are then stored on
    the returned object alongside the untouched raw source.

    Args:
        cell_idx (int): the cell's index in the notebook.
        source (str): the cell's raw source code.
        module_node (:class:`~nbrefactor.datastructs.ModuleNode`): the
            module node (in the built module tree) that owns this cell.

    Returns:
        ParsedCodeCell: the parsed cell, carrying the raw source, the
            analyzed source, its dependencies and the owning module node.
    """

    module_path = module_node.get_full_path()
    analysis = analyze_code_cell(source, module_path)

    # pull the analysis results out first so the constructor call below
    # reads as a plain field-by-field mapping
    analyzed_source = analysis['source']
    cell_dependencies = analysis['dependencies']

    return ParsedCodeCell(
        cell_idx=cell_idx,
        raw_source=source,
        parsed_source=analyzed_source,
        dependencies=cell_dependencies,
        module_node=module_node,
    )


def parse_markdown_cell(cell_idx, source):
    """
    Parses a markdown cell into a ParsedMarkdownCell object.

    The cell is processed line by line so that headers and commands are
    collected in the exact order they appear (which matters when a single
    cell holds several of them). For every line:

    * if it contains a command (``$name`` or ``$name=value``), only the
      command is extracted, even when the line also starts with ``#``;
    * otherwise, if it starts with ``#``, it is recorded as a header whose
      level is the number of *leading* ``#`` characters.

    Commands rejected by ``MarkdownCommand`` (it raises ``ValueError``) do
    not abort parsing; they are collected as warnings instead.

    Args:
        cell_idx (int): the cell's index in the notebook.
        source (str): the cell's markdown content.

    Returns:
        ParsedMarkdownCell: a parsed markdown cell object built from the
            ordered list of headers/commands and the list of warnings.
    """

    # NOTE: commands inside multiline HTML comments are not supported.
    # Matching them with a regex while keeping the sequential order of
    # headers and commands is awkward (regex is not really a "parser"), so
    # each line is assessed individually for now.
    cmd_regex = re.compile(r'\$\b(?P<command>\w+(?:-\w+)*)'
                           r'(?:=(?P<value>.*?))?(?=\s|$|[^\w])')

    # MarkdownCommand + MarkdownHeader objects, in order of appearance.
    # We don't simply match and extract commands/headers across the raw
    # source, so that the order of execution is maintained.
    md_elements = []
    warnings = []

    # `str` is immutable, so the source can be split directly without a
    # defensive copy.
    for line in source.split('\n'):
        clean_line = line.strip()

        cmd_match = cmd_regex.search(clean_line)
        if cmd_match:
            try:
                # raises ValueError if the command name is invalid or an
                # expected value-type is not found
                cmd = MarkdownCommand(cmd_match.group('command'),
                                      cmd_match.group('value'))
            except ValueError as e:
                # NOTE(review): assumes the error's first argument is a
                # dict-like payload -- confirm against MarkdownCommand
                err = e.args[0]
                err['cell_idx'] = cell_idx      # attach cell index for
                                                # logging
                warnings.append(err)
            else:
                md_elements.append(cmd)

            # a command line is never also treated as a header
            continue

        # MD header -> extract node depth/level and name of package/module
        if clean_line.startswith('#'):
            # Only the leading run of '#' defines the level; counting every
            # '#' on the line would inflate it for titles that contain one
            # themselves (e.g. "## Section #1" is level 2, not 3).
            md_level = len(clean_line) - len(clean_line.lstrip('#'))
            # hashes are stripped from both ends, so an optional closing
            # sequence ("## Title ##") is dropped from the name as well
            md_header = clean_line.strip('#').strip()
            md_elements.append(MarkdownHeader(md_header, md_level))

    return ParsedMarkdownCell(md_elements, warnings)