# Source code for coalib.bearlib.languages.documentation.DocumentationExtraction
"""
Language and docstyle independent extraction of documentation comments.
Each of the functions builds upon the previous one; in the end the module
exposes a single function, :func:`extract_documentation_with_markers`,
which is used by :class:`.DocBaseClass` to extract documentation.
"""
import re
from coalib.bearlib.languages.documentation.DocumentationComment import (
DocumentationComment, MalformedComment)
from coalib.results.TextPosition import TextPosition
from coalib.results.TextRange import TextRange
from textwrap import dedent
def _extract_doc_comment_simple(content, line, column, markers):
"""
Extract a documentation that starts at given beginning with simple layout.
The property of the simple layout is that there's no each-line marker. This
applies e.g. for python docstrings.
:param content: Presplitted lines of the source-code-string.
:param line: Line where the documentation comment starts (behind the
start marker). Zero-based.
:param column: Column where the documentation comment starts (behind the
start marker). Zero-based.
:param markers: The documentation identifying markers.
:return: If the comment matched layout a triple with end-of-comment
line, column and the extracted documentation. If not
matched, returns None.
"""
align_column = column - len(markers[0])
pos = content[line].find(markers[2], column)
if pos != -1:
return line, pos + len(markers[2]), content[line][column:pos]
doc_comment = content[line][column:]
line += 1
while line < len(content):
pos = content[line].find(markers[2])
if pos == -1:
line_column = len(content[line])-len(content[line].lstrip())
doc_comment += ('\n' if content[line][align_column:] == ''
else content[line].strip()+'\n'
if line_column < align_column
else content[line][align_column:])
else:
doc_comment += content[line][align_column:pos]
return line, pos + len(markers[2]), doc_comment
line += 1
return None
def _extract_doc_comment_continuous(content, line, column, markers):
"""
Extract a documentation that starts at given beginning with continuous
layout.
The property of the continuous layout is that the each-line-marker and the
end-marker do equal. Documentation is extracted until no further marker is
found. Applies e.g. for doxygen style python documentation::
## main
#
# detailed
:param content: Presplitted lines of the source-code-string.
:param line: Line where the documentation comment starts (behind the
start marker). Zero-based.
:param column: Column where the documentation comment starts (behind the
start marker). Zero-based.
:param markers: The documentation identifying markers.
:return: If the comment matched layout a triple with end-of-comment
line, column and the extracted documentation. If not
matched, returns None.
"""
marker_len = len(markers[1])
doc_comment = content[line][column:]
line += 1
while line < len(content):
pos = content[line].find(markers[1])
if pos == -1:
return line, 0, doc_comment
else:
doc_comment += content[line][pos + marker_len:]
line += 1
if content[line - 1][-1] == '\n':
column = 0
else:
# This case can appear on end-of-document without a ``\n``.
line -= 1
column = len(content[line])
return line, column, doc_comment
def _extract_doc_comment_standard(content, line, column, markers):
"""
Extract a documentation that starts at given beginning with standard
layout.
The standard layout applies e.g. for C doxygen-style documentation::
/**
* documentation
*/
:param content: Presplitted lines of the source-code-string.
:param line: Line where the documentation comment starts (behind the
start marker). Zero-based.
:param column: Column where the documentation comment starts (behind the
start marker). Zero-based.
:param markers: The documentation identifying markers.
:return: If the comment matched layout a triple with end-of-comment
line, column and the extracted documentation. If not
matched, returns None.
"""
pos = content[line].find(markers[2], column)
if pos != -1:
return line, pos + len(markers[2]), content[line][column:pos]
doc_comment = content[line][column:]
line += 1
while line < len(content):
pos = content[line].find(markers[2])
each_line_pos = content[line].find(markers[1])
if pos == -1:
if each_line_pos == -1:
# If the first text occurrence is not the each-line marker
# now we violate the doc-comment layout.
return None
doc_comment += content[line][each_line_pos + len(markers[1]):]
else:
# If no each-line marker found or it's located past the end marker:
# extract no further and end the doc-comment.
if each_line_pos != -1 and each_line_pos + 1 < pos:
doc_comment += content[line][each_line_pos +
len(markers[1]):pos]
return line, pos + len(markers[2]), doc_comment
line += 1
return None
def _extract_doc_comment(content, line, column, markers):
    """
    Pick the extraction routine that fits the given markers and run it.

    :param content: Presplitted lines of the source-code-string.
    :param line:    Line where the documentation comment starts (behind the
                    start marker). Zero-based.
    :param column:  Column where the documentation comment starts (behind the
                    start marker). Zero-based.
    :param markers: The documentation identifying markers.
    :return:        If the comment matched layout a triple with end-of-comment
                    line, column and the extracted documentation. If not
                    matched, returns None.
    """
    each_line_marker = markers[1]
    if each_line_marker == '':
        # No each-line marker: extract and align to the start marker.
        extractor = _extract_doc_comment_simple
    elif each_line_marker == markers[2]:
        # Each-line and end marker coincide: follow the each-line marker
        # until it runs out.
        extractor = _extract_doc_comment_continuous
    else:
        extractor = _extract_doc_comment_standard

    return extractor(content, line, column, markers)
def _compile_multi_match_regex(strings):
"""
Compiles a regex object that matches each of the given strings.
:param strings: The strings to match.
:return: A regex object.
"""
return re.compile('|'.join(re.escape(s) for s in strings))
def _extract_doc_comment_from_line(content, line, column, regex,
                                   marker_dict, docstyle_definition):
    """
    Try to extract one documentation comment beginning on the given line.

    The line is searched, starting at ``column``, for a start marker using
    ``regex``. Each marker set registered for the found start marker is
    tried in turn until one of them yields a documentation comment.

    :param content:             Presplitted lines of the source-code-string.
    :param line:                Zero-based index of the line to search.
    :param column:              Zero-based column to start searching from.
    :param regex:               Compiled regex matching the start markers.
    :param marker_dict:         Maps a start marker to the list of marker
                                sets that begin with it.
    :param docstyle_definition: Passed on to the created
                                ``DocumentationComment``.
    :return:                    A triple of the line and column to continue
                                searching at and the result: a
                                ``DocumentationComment``, a
                                ``MalformedComment`` when a start marker was
                                found but no marker set matched, or None
                                when the line holds no start marker.
    """
    text = content[line]
    begin_match = regex.search(text, column)
    if not begin_match:
        return line + 1, 0, None

    # Everything in front of the start marker counts as indentation.
    indent = text[:begin_match.start()]
    doc_column = begin_match.end()

    for marker in marker_dict[begin_match.group()]:
        extracted = _extract_doc_comment(content, line, doc_column, marker)
        if extracted is not None:
            # ``marker`` stays bound to the marker set that matched.
            break

    if not extracted:
        malformed_comment = MalformedComment(dedent("""\
            Please check the docstring for faulty markers. A starting
            marker has been found, but no instance of DocComment is
            returned."""), line)
        return line + 1, 0, malformed_comment

    end_line, end_column, documentation = extracted
    position = TextPosition(line + 1, len(indent) + 1)
    doc = DocumentationComment(documentation, docstyle_definition,
                               indent, marker, position)
    return end_line, end_column, doc
def extract_documentation_with_markers(content, docstyle_definition):
    """
    Extracts all documentation texts inside the given source-code-string.

    :param content:
        The source-code-string where to extract documentation from.
        Needs to be a list or tuple where each string item is a single
        line (including ending whitespaces like ``\\n``).
    :param docstyle_definition:
        The ``DocstyleDefinition`` instance that defines what docstyle is
        being used in the documentation.
    :return:
        An iterator yielding each ``DocumentationComment`` found in the
        content, as well as a ``MalformedComment`` for every start marker
        for which no documentation comment could be extracted.
    """
    # Prepare marker-tuple dict that maps a begin pattern to the corresponding
    # marker_set(s). This makes it faster to retrieve a marker-set from a
    # begin sequence we initially want to search for in source code. Then
    # the possible found documentation match is processed further with the
    # rest markers.
    markers = docstyle_definition.markers

    marker_dict = {}
    for marker_set in markers:
        marker_dict.setdefault(marker_set[0], []).append(marker_set)

    # Using regexes to perform a variable match is faster than finding each
    # substring with ``str.find()`` choosing the lowest match.
    begin_regex = _compile_multi_match_regex(
        marker_set[0] for marker_set in markers)

    line = 0
    column = 0

    while line < len(content):
        line, column, doc = _extract_doc_comment_from_line(
            content,
            line,
            column,
            begin_regex,
            marker_dict,
            docstyle_definition)

        if doc and isinstance(doc, MalformedComment):
            yield doc
        elif doc:
            # Ignore string literals
            # NOTE(review): the generator below iterates over
            # ``doc.marker[0]`` itself, i.e. presumably over the single
            # characters of the start marker rather than the marker as a
            # whole. Left unchanged here - confirm whether that is intended.
            ignore_regex = re.compile(
                r'^\s*r?(?P<marker>' +
                ('|'.join(re.escape(s) for s in doc.marker[0])) +
                ')')
            # Starting line of doc_string where marker is present
            start_line = doc.range.start.line - 1
            ignore_string_match = ignore_regex.search(content[start_line])

            # Instantiate padding
            top_padding = 0
            bottom_padding = 0
            # minus 2 because we want to check the line before the marker.
            start_index = doc.range.start.line - 2
            end_index = doc.range.end.line
            while start_index >= 0 and not content[start_index].strip():
                top_padding += 1
                start_index -= 1
            # If the end_index is instantiated above the len(content) i.e.
            # In case where ending marker of docstring is at the last line.
            # Then the doc.bottom_padding will be default to 0. This will also
            # prevent IndexError raised by content[end_index].
            while end_index < len(content) and not content[end_index].strip():
                # This condition will take place if theres an inline docstring
                # following documentation.
                # NOTE(review): only the last four characters are compared,
                # which fits an end marker of three characters plus the
                # newline - confirm behaviour for other marker lengths.
                if ((doc.marker[2]+'\n') != content[end_index-1][-4:]
                        and bottom_padding == 0):
                    break
                bottom_padding += 1
                end_index += 1

            class_regex = re.compile(
                doc.docstyle_definition.docstring_type_regex.class_sign)
            function_regex = re.compile(
                doc.docstyle_definition.docstring_type_regex.func_sign)

            # End line differs when mid marker and end marker is different
            if doc.marker[1] == doc.marker[2]:
                end_index = end_index - 1

            # Check for docstring_position and then check for class regex
            # and function regex to define the type of docstring.
            if doc.docstyle_definition.docstring_position == 'top':
                # start_index is -1 when nothing but blank lines (or nothing
                # at all) precedes the comment. Indexing with it would wrap
                # around to the *last* line of the content, so only look at
                # the preceding line when there actually is one.
                if start_index >= 0:
                    if class_regex.search(content[start_index]):
                        doc.docstring_type = 'class'
                    elif function_regex.search(content[start_index]):
                        doc.docstring_type = 'function'
            elif doc.docstyle_definition.docstring_position == 'bottom':
                if (end_index < len(content) and
                        class_regex.search(content[end_index])):
                    doc.docstring_type = 'class'
                elif (end_index < len(content) and
                        function_regex.search(content[end_index])):
                    doc.docstring_type = 'function'

            # Disabled automatic padding for docstring_type='others' as this
            # will cause overlapping of range in consecutive docstrings. Which
            # diff.replace() is unable to handle.
            if doc.docstring_type != 'others':
                doc.top_padding = top_padding
                doc.bottom_padding = bottom_padding

                doc.range = TextRange.from_values(
                    start_index + 2,
                    1 if top_padding > 0 else doc.range.start.column,
                    end_index,
                    1 if bottom_padding > 0 else doc.range.end.column)

            # Comments whose start line did not match ignore_regex are
            # dropped (see "Ignore string literals" above).
            if ignore_string_match:
                yield doc