diff --git a/tools/checkorder/checkorder.py b/tools/checkorder/checkorder.py index b4c94707..4f748dcd 100644 --- a/tools/checkorder/checkorder.py +++ b/tools/checkorder/checkorder.py @@ -2,7 +2,10 @@ import sys import argparse from typing import TextIO -from isledecomp.dir import walk_source_dir +from isledecomp.dir import ( + walk_source_dir, + file_is_cpp +) from isledecomp.parser import find_code_blocks from isledecomp.parser.util import ( is_exact_offset_comment diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py index d8aa6fbf..903e9064 100644 --- a/tools/isledecomp/isledecomp/parser/parser.py +++ b/tools/isledecomp/isledecomp/parser/parser.py @@ -7,6 +7,8 @@ is_blank_or_comment, match_offset_comment, is_exact_offset_comment, + template_function_name, + remove_trailing_comment, ) @@ -14,6 +16,9 @@ class ReaderState(Enum): WANT_OFFSET = 0 WANT_SIG = 1 IN_FUNC = 2 + IN_TEMPLATE = 3 + WANT_CURLY = 4 + FUNCTION_DONE = 5 def find_code_blocks(stream: TextIO) -> List[CodeBlock]: @@ -25,53 +30,106 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: blocks = [] - offset = None + offset_match = None offset_comment = None function_sig = None start_line = None + end_line = None state = ReaderState.WANT_OFFSET - for line_no, line in enumerate(stream): - if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC): + # 1-based to match cvdump and your text-editor + # I know it says 0, but we will increment before each readline() + line_no = 0 + can_seek = True + + while True: + # Do this before reading again so that an EOF will not + # cause us to miss the last function of the file. + if state == ReaderState.FUNCTION_DONE: + block = CodeBlock(offset=offset_match.address, + signature=function_sig, + start_line=start_line, + end_line=end_line, + offset_comment=offset_comment, + module=offset_match.module, + is_template=offset_match.is_template, + is_stub=offset_match.is_stub) + blocks.append(block) + state = ReaderState.WANT_OFFSET + + if can_seek: + line_no += 1 + line = stream.readline() + if line == '': + break + + if (state != ReaderState.WANT_OFFSET and + match_offset_comment(line) is not None): + # We hit another offset unexpectedly. + # We can recover easily by just ending the function here. + end_line = line_no - 1 + state = ReaderState.FUNCTION_DONE + + # Pause reading here so we handle the offset marker + # on the next loop iteration + can_seek = False + + # Regular state machine handling begins now + if state == ReaderState.IN_TEMPLATE: + # TEMPLATE functions are a special case. The signature is + # given on the next line (in a // comment) + function_sig = template_function_name(line) + start_line = line_no + end_line = line_no + state = ReaderState.FUNCTION_DONE + + elif state == ReaderState.WANT_SIG: + # Skip blank lines or comments that come after the offset + # marker. There is not a formal procedure for this, so just + # assume the next "code line" is the function signature + if not is_blank_or_comment(line): + function_sig = remove_trailing_comment(line.strip()) + + # Now check to see if the opening curly bracket is on the + # same line. clang-format should prevent this (BraceWrapping) + # but it is easy to detect. + # If the entire function is on one line, we can handle that + # too, although this should be limited to inlines. + if function_sig.endswith('{'): + start_line = line_no + state = ReaderState.IN_FUNC + elif (function_sig.endswith('}') or + function_sig.endswith('};')): + start_line = line_no + end_line = line_no + state = ReaderState.FUNCTION_DONE + else: + state = ReaderState.WANT_CURLY + + elif state == ReaderState.WANT_CURLY: + if line.strip() == '{': + start_line = line_no + state = ReaderState.IN_FUNC + + elif state == ReaderState.IN_FUNC: # Naive but reasonable assumption that functions will end with # a curly brace on its own line with no prepended spaces. if line.startswith('}'): - # TODO: could streamline this and the next case - block = CodeBlock(offset=offset, - signature=function_sig, - start_line=start_line, - end_line=line_no, - offset_comment=offset_comment) + end_line = line_no + state = ReaderState.FUNCTION_DONE - blocks.append(block) - state = ReaderState.WANT_OFFSET - elif match_offset_comment(line) is not None: - # We hit another offset unexpectedly before detecting the - # end of the function. We can recover easily by just - # ending the function here. - block = CodeBlock(offset=offset, - signature=function_sig, - start_line=start_line, - end_line=line_no - 1, - offset_comment=offset_comment) - - blocks.append(block) - state = ReaderState.WANT_OFFSET - - # We want to grab the function signature so we can identify - # the code block. Skip any blank lines or comments - # that follow the offset comment. - elif (not is_blank_or_comment(line) - and state == ReaderState.WANT_SIG): - function_sig = line.strip() - state = ReaderState.IN_FUNC - - if state == ReaderState.WANT_OFFSET: + elif state == ReaderState.WANT_OFFSET: + # If we detected an offset marker unexpectedly, we are handling + # it here so we can continue seeking. + can_seek = True match = match_offset_comment(line) if match is not None: - offset = int(match, 16) + offset_match = match offset_comment = line.strip() - start_line = line_no - state = ReaderState.WANT_SIG + + if match.is_template: + state = ReaderState.IN_TEMPLATE + else: + state = ReaderState.WANT_SIG return blocks diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py index d69b6c2e..4bf53a97 100644 --- a/tools/isledecomp/isledecomp/parser/util.py +++ b/tools/isledecomp/isledecomp/parser/util.py @@ -5,17 +5,47 @@ CodeBlock = namedtuple('CodeBlock', ['offset', 'signature', 'start_line', 'end_line', - 'offset_comment']) + 'offset_comment', 'module', 'is_template', 'is_stub']) +OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', + 'is_template', 'is_stub']) + +# This has not been formally established, but considering that "STUB" +# is a temporary state for a function, we assume it will appear last, +# after any other modifiers (i.e. TEMPLATE) # To match a reasonable variance of formatting for the offset comment -offsetCommentRegex = re.compile(r'//\s*OFFSET:\s*\w+\s+(?:0x)?([a-f0-9]+)(\s+STUB)?', +offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?', # nopep8 flags=re.I) # To match the exact syntax (text upper case, hex lower case, with spaces) # that is used in most places -# TODO: template and stub mutually exclusive? -offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)(?: STUB| TEMPLATE)?$') +offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$') # nopep8 + + +# The goal here is to just read whatever is on the next line, so some +# flexibility in the formatting seems OK +templateCommentRegex = re.compile(r'\s*//\s+(.*)') + + +# To remove any comment (//) or block comment (/*) and its leading spaces +# from the end of a code line +trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$') + + +def template_function_name(line: str) -> str: + """Parse function signature for special TEMPLATE functions""" + template_match = templateCommentRegex.match(line) + + # If we don't match, you get whatever is on the line as the signature + if template_match is not None: + return template_match.group(1) + else: + return line + + +def remove_trailing_comment(line: str) -> str: + return trailingCommentRegex.sub('', line) def is_blank_or_comment(line: str) -> bool: @@ -35,11 +65,12 @@ def is_exact_offset_comment(line: str) -> bool: return offsetCommentExactRegex.match(line) is not None -def match_offset_comment(line: str) -> str | None: - # TODO: intended to skip the expensive regex match, but is it necessary? - # TODO: this will skip indented offsets - if not line.startswith('//'): +def match_offset_comment(line: str) -> OffsetMatch | None: + match = offsetCommentRegex.match(line) + if match is None: return None - match = offsetCommentRegex.match(line) - return match.group(1) if match is not None else None + return OffsetMatch(module=match.group(1), + address=int(match.group(2), 16), + is_template=match.group(3) is not None, + is_stub=match.group(4) is not None) diff --git a/tools/isledecomp/tests/samples/basic_class.cpp b/tools/isledecomp/tests/samples/basic_class.cpp new file mode 100644 index 00000000..018eed41 --- /dev/null +++ b/tools/isledecomp/tests/samples/basic_class.cpp @@ -0,0 +1,29 @@ +// Sample for python unit tests +// Not part of the decomp + +// A very simple class + +class TestClass { +public: + TestClass(); + virtual ~TestClass() override; + + virtual MxResult Tickle() override; // vtable+08 + + // OFFSET: LEGO1 0x12345678 + inline const char* ClassName() const // vtable+0c + { + // 0xabcd1234 + return "TestClass"; + } + + // OFFSET: LEGO1 0xdeadbeef + inline MxBool IsA(const char* name) const override // vtable+10 + { + return !strcmp(name, TestClass::ClassName()); + } + +private: + int m_hello; + int m_hiThere; +}; diff --git a/tools/isledecomp/tests/samples/inline.cpp b/tools/isledecomp/tests/samples/inline.cpp new file mode 100644 index 00000000..c2cb698c --- /dev/null +++ b/tools/isledecomp/tests/samples/inline.cpp @@ -0,0 +1,8 @@ +// Sample for python unit tests +// Not part of the decomp + +// OFFSET: LEGO1 0x10000001 +inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there + +// OFFSET: LEGO1 0x10000002 +inline const char* OneLine() const { return "MxDSObject"; }; diff --git a/tools/isledecomp/tests/samples/poorly_formatted.cpp b/tools/isledecomp/tests/samples/poorly_formatted.cpp new file mode 100644 index 00000000..32dd774c --- /dev/null +++ b/tools/isledecomp/tests/samples/poorly_formatted.cpp @@ -0,0 +1,23 @@ +// Sample for python unit tests +// Not part of the decomp + +// While it's reasonable to expect a well-formed file (and clang-format +// will make sure we get one), this will put the parser through its paces. + +// OFFSET: TEST 0x1234 +void curly_with_spaces() + { + static char* msg = "hello"; + } + +// OFFSET: TEST 0x5555 +void weird_closing_curly() +{ + int x = 123; } + +// OFFSET: HELLO 0x5656 +void bad_indenting() { + if (0) +{ + int y = 5; +}} diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py index e76553fc..5b7a59ff 100644 --- a/tools/isledecomp/tests/test_parser.py +++ b/tools/isledecomp/tests/test_parser.py @@ -30,9 +30,10 @@ def test_sanity(): assert len(blocks) == 3 assert code_blocks_are_sorted(blocks) is True - # n.b. The parser returns line numbers as 0-based - assert blocks[0].start_line == 5 - assert blocks[0].end_line == 9 + # n.b. The parser returns line numbers as 1-based + # Function starts when we see the opening curly brace + assert blocks[0].start_line == 8 + assert blocks[0].end_line == 10 def test_oneline(): @@ -42,11 +43,7 @@ def test_oneline(): blocks = find_code_blocks(f) assert len(blocks) == 2 - assert blocks[0].start_line == 3 - # TODO: Because of the way it works now, this captures the blank line - # as part of the function. That's not *incorrect* per se, but - # this needs to be more consistent if we want the tool to sort the - # code blocks in the file. + assert blocks[0].start_line == 5 assert blocks[0].end_line == 5 @@ -70,3 +67,39 @@ def test_jumbled_case(): assert len(blocks) == 3 assert code_blocks_are_sorted(blocks) is False + + +def test_bad_file(): + with sample_file('poorly_formatted.cpp') as f: + blocks = find_code_blocks(f) + + assert len(blocks) == 3 + + +def test_indented(): + """Offsets for functions inside of a class will probably be indented.""" + with sample_file('basic_class.cpp') as f: + blocks = find_code_blocks(f) + + # TODO: We don't properly detect the end of these functions + # because the closing brace is indented. However... knowing where each + # function ends is less important (for now) than capturing + # all the functions that are there. + + assert len(blocks) == 2 + assert blocks[0].offset == int('0x12345678', 16) + assert blocks[0].start_line == 15 + #assert blocks[0].end_line == 18 + + assert blocks[1].offset == int('0xdeadbeef', 16) + assert blocks[1].start_line == 22 + #assert blocks[1].end_line == 24 + +def test_inline(): + with sample_file('inline.cpp') as f: + blocks = find_code_blocks(f) + + assert len(blocks) == 2 + for block in blocks: + assert block.start_line is not None + assert block.start_line == block.end_line