diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py index 483d4766..f5638eb0 100644 --- a/tools/isledecomp/isledecomp/parser/parser.py +++ b/tools/isledecomp/isledecomp/parser/parser.py @@ -10,6 +10,7 @@ is_exact_offset_comment, template_function_name, remove_trailing_comment, + distinct_module, ) @@ -31,11 +32,8 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: blocks = [] - offset_match = OffsetMatch(module=None, - address=None, - is_template=None, - is_stub=None) - offset_comment = None + offset_matches = [] + function_sig = None start_line = None end_line = None @@ -50,15 +48,19 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: # Do this before reading again so that an EOF will not # cause us to miss the last function of the file. if state == ReaderState.FUNCTION_DONE: - block = CodeBlock(offset=offset_match.address, - signature=function_sig, - start_line=start_line, - end_line=end_line, - offset_comment=offset_comment, - module=offset_match.module, - is_template=offset_match.is_template, - is_stub=offset_match.is_stub) - blocks.append(block) + # Our list of offset marks could have duplicates on + # module name, so we'll eliminate those now. + for offset_match in distinct_module(offset_matches): + block = CodeBlock(offset=offset_match.address, + signature=function_sig, + start_line=start_line, + end_line=end_line, + offset_comment=offset_match.comment, + module=offset_match.module, + is_template=offset_match.is_template, + is_stub=offset_match.is_stub) + blocks.append(block) + offset_matches = [] state = ReaderState.WANT_OFFSET if can_seek: @@ -67,19 +69,33 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: if line == '': break - if (state != ReaderState.WANT_OFFSET and - match_offset_comment(line) is not None): - # We hit another offset unexpectedly. - # We can recover easily by just ending the function here. 
- end_line = line_no - 1 - state = ReaderState.FUNCTION_DONE + new_match = match_offset_comment(line) + if new_match is not None: + # We will allow multiple offsets if we have just begun + # the code block, but not after we hit the curly brace. + if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE, + ReaderState.WANT_SIG): + # If we detected an offset marker unexpectedly, + # we are handling it here so we can continue seeking. + can_seek = True - # Pause reading here so we handle the offset marker - # on the next loop iteration - can_seek = False + offset_matches.append(new_match) - # Regular state machine handling begins now - if state == ReaderState.IN_TEMPLATE: + if new_match.is_template: + state = ReaderState.IN_TEMPLATE + else: + state = ReaderState.WANT_SIG + else: + # We hit another offset unexpectedly. + # We can recover easily by just ending the function here. + end_line = line_no - 1 + state = ReaderState.FUNCTION_DONE + + # Pause reading here so we handle the offset marker + # on the next loop iteration + can_seek = False + + elif state == ReaderState.IN_TEMPLATE: # TEMPLATE functions are a special case. The signature is # given on the next line (in a // comment) function_sig = template_function_name(line) @@ -92,13 +108,14 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: # marker. There is not a formal procedure for this, so just # assume the next "code line" is the function signature if not is_blank_or_comment(line): + # Inline functions may end with a comment. Strip that out + # to help parsing. function_sig = remove_trailing_comment(line.strip()) # Now check to see if the opening curly bracket is on the # same line. clang-format should prevent this (BraceWrapping) # but it is easy to detect. - # If the entire function is on one line, we can handle that - # too, although this should be limited to inlines. + # If the entire function is on one line, handle that too. 
def distinct_module(offsets: "list[OffsetMatch]") -> "list[OffsetMatch]":
    """Return the offset markers with duplicate module names removed.

    Module names are compared case-insensitively (by their upper-cased
    form). When a module name is repeated, the marker that appears first
    in `offsets` is kept and every later one is dropped. The surviving
    markers keep their original relative order.

    `offsets` may be any iterable of objects that expose a `module`
    string attribute. The return value is always a newly built list, even
    for empty or single-item input, so the caller can clear or mutate
    either list without affecting the other.
    """

    # Dict maintains insertion order in python >=3.7, so the values come
    # back in first-seen order. setdefault stores a marker only when its
    # module name has not been seen yet, which keeps the first occurrence.
    first_by_module = {}
    for offset in offsets:
        first_by_module.setdefault(offset.module.upper(), offset)

    return list(first_by_module.values())
# Helper for the next test: cut down version of OffsetMatch
MiniOfs = namedtuple('MiniOfs', ['module', 'value'])

# Each entry is (markers passed to distinct_module, list expected back).
distinct_module_samples = [
    # empty set
    ([], []),
    # same module name
    (
        [MiniOfs(module='TEST', value=123), MiniOfs(module='TEST', value=555)],
        [MiniOfs(module='TEST', value=123)],
    ),
    # same module name, case-insensitive
    (
        [MiniOfs(module='test', value=123), MiniOfs(module='TEST', value=555)],
        [MiniOfs(module='test', value=123)],
    ),
    # duplicates, non-consecutive
    (
        [
            MiniOfs(module='test', value=123),
            MiniOfs(module='abc', value=111),
            MiniOfs(module='TEST', value=555),
        ],
        [MiniOfs(module='test', value=123), MiniOfs(module='abc', value=111)],
    ),
]


@pytest.mark.parametrize('sample, expected', distinct_module_samples)
def test_distinct_module(sample: [MiniOfs], expected: [MiniOfs]):
    """Check distinct_module against each (input, expected) sample pair."""
    deduplicated = distinct_module(sample)
    assert deduplicated == expected