Allow multiple offset markers, pep8 cleanup

2026-01-27 18:21:15 +00:00 · 2023-11-14 22:25:37 -05:00 · 2023-11-14 22:25:37 -05:00 · 09688ed83f
commit 09688ed83f
parent f734d2733d
5 changed files with 142 additions and 47 deletions
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@ -10,6 +10,7 @@
    is_exact_offset_comment,
    template_function_name,
    remove_trailing_comment,
    distinct_module,
 )
@ -31,11 +32,8 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
    blocks = []
-    offset_match = OffsetMatch(module=None,
+    offset_matches = []
-                               address=None,
+
                               is_template=None,
                               is_stub=None)
    offset_comment = None
    function_sig = None
    start_line = None
    end_line = None
@ -50,15 +48,19 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
        # Do this before reading again so that an EOF will not
        # cause us to miss the last function of the file.
        if state == ReaderState.FUNCTION_DONE:
-            block = CodeBlock(offset=offset_match.address,
+            # Our list of offset marks could have duplicates on
-                              signature=function_sig,
+            # module name, so we'll eliminate those now.
-                              start_line=start_line,
+            for offset_match in distinct_module(offset_matches):
-                              end_line=end_line,
+                block = CodeBlock(offset=offset_match.address,
-                              offset_comment=offset_comment,
+                                  signature=function_sig,
-                              module=offset_match.module,
+                                  start_line=start_line,
-                              is_template=offset_match.is_template,
+                                  end_line=end_line,
-                              is_stub=offset_match.is_stub)
+                                  offset_comment=offset_match.comment,
-            blocks.append(block)
+                                  module=offset_match.module,
                                  is_template=offset_match.is_template,
                                  is_stub=offset_match.is_stub)
                blocks.append(block)
            offset_matches = []
            state = ReaderState.WANT_OFFSET
        if can_seek:
@ -67,19 +69,33 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
            if line == '':
                break
-        if (state != ReaderState.WANT_OFFSET and
+        new_match = match_offset_comment(line)
-                match_offset_comment(line) is not None):
+        if new_match is not None:
-            # We hit another offset unexpectedly.
+            # We will allow multiple offsets if we have just begun
-            # We can recover easily by just ending the function here.
+            # the code block, but not after we hit the curly brace.
-            end_line = line_no - 1
+            if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
-            state = ReaderState.FUNCTION_DONE
+                         ReaderState.WANT_SIG):
                # If we detected an offset marker unexpectedly,
                # we are handling it here so we can continue seeking.
                can_seek = True
-            # Pause reading here so we handle the offset marker
+                offset_matches.append(new_match)
            # on the next loop iteration
            can_seek = False
-        # Regular state machine handling begins now
+                if new_match.is_template:
-        if state == ReaderState.IN_TEMPLATE:
+                    state = ReaderState.IN_TEMPLATE
                else:
                    state = ReaderState.WANT_SIG
            else:
                # We hit another offset unexpectedly.
                # We can recover easily by just ending the function here.
                end_line = line_no - 1
                state = ReaderState.FUNCTION_DONE
                # Pause reading here so we handle the offset marker
                # on the next loop iteration
                can_seek = False
        elif state == ReaderState.IN_TEMPLATE:
            # TEMPLATE functions are a special case. The signature is
            # given on the next line (in a // comment)
            function_sig = template_function_name(line)
@ -92,13 +108,14 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
            # marker. There is not a formal procedure for this, so just
            # assume the next "code line" is the function signature
            if not is_blank_or_comment(line):
                # Inline functions may end with a comment. Strip that out
                # to help parsing.
                function_sig = remove_trailing_comment(line.strip())
                # Now check to see if the opening curly bracket is on the
                # same line. clang-format should prevent this (BraceWrapping)
                # but it is easy to detect.
-                # If the entire function is on one line, we can handle that
+                # If the entire function is on one line, handle that too.
                # too, although this should be limited to inlines.
                if function_sig.endswith('{'):
                    start_line = line_no
                    state = ReaderState.IN_FUNC
@ -122,18 +139,4 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
                end_line = line_no
                state = ReaderState.FUNCTION_DONE
        elif state == ReaderState.WANT_OFFSET:
            # If we detected an offset marker unexpectedly, we are handling
            # it here so we can continue seeking.
            can_seek = True
            match = match_offset_comment(line)
            if match is not None:
                offset_match = match
                offset_comment = line.strip()
                if match.is_template:
                    state = ReaderState.IN_TEMPLATE
                else:
                    state = ReaderState.WANT_SIG
    return blocks
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@ -1,5 +1,5 @@
 # C++ Parser utility functions and data structures
-from __future__ import annotations # python <3.10 compatibility
+from __future__ import annotations  # python <3.10 compatibility
 import re
 from collections import namedtuple
@ -8,8 +8,8 @@
                       ['offset', 'signature', 'start_line', 'end_line',
                        'offset_comment', 'module', 'is_template', 'is_stub'])
-OffsetMatch = namedtuple('OffsetMatch', ['module', 'address',
+OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
-                                         'is_template', 'is_stub'])
+                                         'is_stub', 'comment'])
 # This has not been formally established, but considering that "STUB"
 # is a temporary state for a function, we assume it will appear last,
@ -74,4 +74,23 @@ def match_offset_comment(line: str) -> OffsetMatch | None:
    return OffsetMatch(module=match.group(1),
                       address=int(match.group(2), 16),
                       is_template=match.group(3) is not None,
-                       is_stub=match.group(4) is not None)
+                       is_stub=match.group(4) is not None,
                       comment=line.strip())
 def distinct_module(offsets: [OffsetMatch]) -> [OffsetMatch]:
    """Drop offset markers whose module name was already seen.

       Module names are compared case-insensitively and the earliest
       marker for each module is the one that is kept. A list with
       fewer than two markers is returned as-is (the same object)."""
    if len(offsets) <= 1:
        # Nothing to compare against; hand the input back unchanged.
        return offsets
    # Keyed by upper-cased module name. Dicts keep insertion order
    # (python >=3.7), so the result preserves first-seen order.
    first_seen = {}
    for marker in offsets:
        # setdefault stores only when the key is absent: first one wins.
        first_seen.setdefault(marker.module.upper(), marker)
    return [*first_seen.values()]
--- a/tools/isledecomp/tests/samples/multiple_offsets.cpp
+++ b/tools/isledecomp/tests/samples/multiple_offsets.cpp
@ -0,0 +1,25 @@
 // Sample for python unit tests
 // Not part of the decomp
 // Handling multiple offset markers
 // OFFSET: TEST 0x1234
 // OFFSET: HELLO 0x5555
 void different_modules()
 {
  // TODO
 }
 // OFFSET: TEST 0x2345
 // OFFSET: TEST 0x1234
 void same_module()
 {
  // TODO
 }
 // OFFSET: TEST 0x2002
 // OFFSET: test 0x1001
 void same_case_insensitive()
 {
  // TODO
 }
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@ -89,11 +89,12 @@ def test_indented():
    assert len(blocks) == 2
    assert blocks[0].offset == int('0x12345678', 16)
    assert blocks[0].start_line == 15
-    #assert blocks[0].end_line == 18
+    # assert blocks[0].end_line == 18
    assert blocks[1].offset == int('0xdeadbeef', 16)
    assert blocks[1].start_line == 22
-    #assert blocks[1].end_line == 24
+    # assert blocks[1].end_line == 24
 def test_inline():
    with sample_file('inline.cpp') as f:
@ -103,3 +104,25 @@ def test_inline():
    for block in blocks:
        assert block.start_line is not None
        assert block.start_line == block.end_line
 def test_multiple_offsets():
    """If multiple offset markers appear before a code block, take them
       all but ensure the module name (case-insensitive) is distinct.
       Use the first module occurrence in case of duplicates."""
    with sample_file('multiple_offsets.cpp') as f:
        blocks = find_code_blocks(f)
    # The sample has three functions with two markers each:
    # different_modules() keeps both markers (2 blocks), while
    # same_module() and same_case_insensitive() keep one each.
    assert len(blocks) == 4
    # Both blocks for different_modules() share the same start line.
    assert blocks[0].module == 'TEST'
    assert blocks[0].start_line == 9
    assert blocks[1].module == 'HELLO'
    assert blocks[1].start_line == 9
    # Duplicate modules are ignored: only the first marker (0x2345)
    # survives for same_module().
    assert blocks[2].start_line == 16
    assert blocks[2].offset == 0x2345
    # 'TEST' and 'test' count as the same module; the first one wins.
    assert blocks[3].module == 'TEST'
    assert blocks[3].offset == 0x2002
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@ -1,8 +1,10 @@
 import pytest
 from collections import namedtuple
 from isledecomp.parser.util import (
    is_blank_or_comment,
    match_offset_comment,
    is_exact_offset_comment,
    distinct_module,
 )
@ -85,3 +87,26 @@ def test_offset_match(line: str, match: bool, exact):
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
 def test_exact_offset_comment(line: str, exact: bool, match):
    """Check is_exact_offset_comment(line) against the sample's
       `exact` flag. The `match` value is part of each sample but
       is not used by this test."""
    assert is_exact_offset_comment(line) is exact
 # Helper for the next test: cut down version of OffsetMatch
 # (distinct_module only reads the `module` attribute of each item).
 MiniOfs = namedtuple('MiniOfs', ['module', 'value'])
 # Each entry is a (sample, expected) pair: the list passed to
 # distinct_module and the list it should return.
 distinct_module_samples = [
    # empty set
    ([], []),
    # same module name
    ([MiniOfs('TEST', 123), MiniOfs('TEST', 555)],
     [MiniOfs('TEST', 123)]),
    # same module name, case-insensitive
    ([MiniOfs('test', 123), MiniOfs('TEST', 555)],
     [MiniOfs('test', 123)]),
    # duplicates, non-consecutive
    ([MiniOfs('test', 123), MiniOfs('abc', 111), MiniOfs('TEST', 555)],
     [MiniOfs('test', 123), MiniOfs('abc', 111)]),
 ]
@pytest.mark.parametrize('sample, expected', distinct_module_samples)
 def test_distinct_module(sample: [MiniOfs], expected: [MiniOfs]):
    """distinct_module(sample) must equal the expected list, which
       keeps the first entry seen for each module name."""
    assert distinct_module(sample) == expected