Allow multiple offset markers, pep8 cleanup

2026-01-24 08:41:16 +00:00 · 2023-11-14 22:25:37 -05:00 · 2023-11-14 22:25:37 -05:00 · 09688ed83f
commit 09688ed83f
parent f734d2733d
5 changed files with 142 additions and 47 deletions
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@ -10,6 +10,7 @@
    is_exact_offset_comment,
    template_function_name,
    remove_trailing_comment,
+    distinct_module,
 )


@ -31,11 +32,8 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:

    blocks = []

-    offset_match = OffsetMatch(module=None,
-                               address=None,
-                               is_template=None,
-                               is_stub=None)
-    offset_comment = None
+    offset_matches = []
+
    function_sig = None
    start_line = None
    end_line = None
@ -50,15 +48,19 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
        # Do this before reading again so that an EOF will not
        # cause us to miss the last function of the file.
        if state == ReaderState.FUNCTION_DONE:
-            block = CodeBlock(offset=offset_match.address,
-                              signature=function_sig,
-                              start_line=start_line,
-                              end_line=end_line,
-                              offset_comment=offset_comment,
-                              module=offset_match.module,
-                              is_template=offset_match.is_template,
-                              is_stub=offset_match.is_stub)
-            blocks.append(block)
+            # Our list of offset marks could have duplicates on
+            # module name, so we'll eliminate those now.
+            for offset_match in distinct_module(offset_matches):
+                block = CodeBlock(offset=offset_match.address,
+                                  signature=function_sig,
+                                  start_line=start_line,
+                                  end_line=end_line,
+                                  offset_comment=offset_match.comment,
+                                  module=offset_match.module,
+                                  is_template=offset_match.is_template,
+                                  is_stub=offset_match.is_stub)
+                blocks.append(block)
+            offset_matches = []
            state = ReaderState.WANT_OFFSET

        if can_seek:
@ -67,19 +69,33 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
            if line == '':
                break

-        if (state != ReaderState.WANT_OFFSET and
-                match_offset_comment(line) is not None):
-            # We hit another offset unexpectedly.
-            # We can recover easily by just ending the function here.
-            end_line = line_no - 1
-            state = ReaderState.FUNCTION_DONE
+        new_match = match_offset_comment(line)
+        if new_match is not None:
+            # We will allow multiple offsets if we have just begun
+            # the code block, but not after we hit the curly brace.
+            if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
+                         ReaderState.WANT_SIG):
+                # If we detected an offset marker unexpectedly,
+                # we are handling it here so we can continue seeking.
+                can_seek = True

-            # Pause reading here so we handle the offset marker
-            # on the next loop iteration
-            can_seek = False
+                offset_matches.append(new_match)

-        # Regular state machine handling begins now
-        if state == ReaderState.IN_TEMPLATE:
+                if new_match.is_template:
+                    state = ReaderState.IN_TEMPLATE
+                else:
+                    state = ReaderState.WANT_SIG
+            else:
+                # We hit another offset unexpectedly.
+                # We can recover easily by just ending the function here.
+                end_line = line_no - 1
+                state = ReaderState.FUNCTION_DONE
+
+                # Pause reading here so we handle the offset marker
+                # on the next loop iteration
+                can_seek = False
+
+        elif state == ReaderState.IN_TEMPLATE:
            # TEMPLATE functions are a special case. The signature is
            # given on the next line (in a // comment)
            function_sig = template_function_name(line)
@ -92,13 +108,14 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
            # marker. There is not a formal procedure for this, so just
            # assume the next "code line" is the function signature
            if not is_blank_or_comment(line):
+                # Inline functions may end with a comment. Strip that out
+                # to help parsing.
                function_sig = remove_trailing_comment(line.strip())

                # Now check to see if the opening curly bracket is on the
                # same line. clang-format should prevent this (BraceWrapping)
                # but it is easy to detect.
-                # If the entire function is on one line, we can handle that
-                # too, although this should be limited to inlines.
+                # If the entire function is on one line, handle that too.
                if function_sig.endswith('{'):
                    start_line = line_no
                    state = ReaderState.IN_FUNC
@ -122,18 +139,4 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
                end_line = line_no
                state = ReaderState.FUNCTION_DONE

-        elif state == ReaderState.WANT_OFFSET:
-            # If we detected an offset marker unexpectedly, we are handling
-            # it here so we can continue seeking.
-            can_seek = True
-            match = match_offset_comment(line)
-            if match is not None:
-                offset_match = match
-                offset_comment = line.strip()
-
-                if match.is_template:
-                    state = ReaderState.IN_TEMPLATE
-                else:
-                    state = ReaderState.WANT_SIG
-
    return blocks
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@ -1,5 +1,5 @@
 # C++ Parser utility functions and data structures
-from __future__ import annotations # python <3.10 compatibility
+from __future__ import annotations  # python <3.10 compatibility
 import re
 from collections import namedtuple

@ -8,8 +8,8 @@
                       ['offset', 'signature', 'start_line', 'end_line',
                        'offset_comment', 'module', 'is_template', 'is_stub'])

-OffsetMatch = namedtuple('OffsetMatch', ['module', 'address',
-                                         'is_template', 'is_stub'])
+OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
+                                         'is_stub', 'comment'])

 # This has not been formally established, but considering that "STUB"
 # is a temporary state for a function, we assume it will appear last,
@ -74,4 +74,23 @@ def match_offset_comment(line: str) -> OffsetMatch | None:
    return OffsetMatch(module=match.group(1),
                       address=int(match.group(2), 16),
                       is_template=match.group(3) is not None,
-                       is_stub=match.group(4) is not None)
+                       is_stub=match.group(4) is not None,
+                       comment=line.strip())
+
+
+def distinct_module(offsets: [OffsetMatch]) -> [OffsetMatch]:
+    """Given a list of offset markers, return a list in which each module
+       name (compared case-insensitively) occurs once. For a repeated name,
+       keep the marker that appears first; first-seen order is preserved."""
+
+    if len(offsets) < 2:
+        return offsets  # 0 or 1 markers: returns the input list itself
+
+    # Keyed by upper-cased module; dict keeps insertion order (py >=3.7)
+    offsets_dict = {}
+    for offset in offsets:
+        module_upper = offset.module.upper()
+        if module_upper not in offsets_dict:
+            offsets_dict[module_upper] = offset
+
+    return list(offsets_dict.values())
--- a/tools/isledecomp/tests/samples/multiple_offsets.cpp
+++ b/tools/isledecomp/tests/samples/multiple_offsets.cpp
@ -0,0 +1,25 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// Handling multiple offset markers
+
+// OFFSET: TEST 0x1234
+// OFFSET: HELLO 0x5555
+void different_modules()
+{
+  // TODO
+}
+
+// OFFSET: TEST 0x2345
+// OFFSET: TEST 0x1234
+void same_module()
+{
+  // TODO
+}
+
+// OFFSET: TEST 0x2002
+// OFFSET: test 0x1001
+void same_case_insensitive()
+{
+  // TODO
+}
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@ -89,11 +89,12 @@ def test_indented():
    assert len(blocks) == 2
    assert blocks[0].offset == int('0x12345678', 16)
    assert blocks[0].start_line == 15
-    #assert blocks[0].end_line == 18
+    # assert blocks[0].end_line == 18

    assert blocks[1].offset == int('0xdeadbeef', 16)
    assert blocks[1].start_line == 22
-    #assert blocks[1].end_line == 24
+    # assert blocks[1].end_line == 24
+

 def test_inline():
    with sample_file('inline.cpp') as f:
@ -103,3 +104,25 @@ def test_inline():
    for block in blocks:
        assert block.start_line is not None
        assert block.start_line == block.end_line
+
+
+def test_multiple_offsets():
+    """If multiple offset marks appear before a code block, take them
+       all but ensure module name (case-insensitive) is distinct.
+       Use first module occurrence in case of duplicates."""
+    with sample_file('multiple_offsets.cpp') as f:
+        blocks = find_code_blocks(f)
+
+    assert len(blocks) == 4
+    assert blocks[0].module == 'TEST'
+    assert blocks[0].start_line == 9
+
+    assert blocks[1].module == 'HELLO'
+    assert blocks[1].start_line == 9
+
+    # Duplicate modules are ignored: only the first marker is kept
+    assert blocks[2].start_line == 16
+    assert blocks[2].offset == 0x2345
+
+    assert blocks[3].module == 'TEST'
+    assert blocks[3].offset == 0x2002
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@ -1,8 +1,10 @@
 import pytest
+from collections import namedtuple
 from isledecomp.parser.util import (
    is_blank_or_comment,
    match_offset_comment,
    is_exact_offset_comment,
+    distinct_module,
 )


@ -85,3 +87,26 @@ def test_offset_match(line: str, match: bool, exact):
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
 def test_exact_offset_comment(line: str, exact: bool, match):
    assert is_exact_offset_comment(line) is exact
+
+
+# Helper for the next test: cut-down version of OffsetMatch
+MiniOfs = namedtuple('MiniOfs', ['module', 'value'])
+
+distinct_module_samples = [
+    # empty list
+    ([], []),
+    # same module name
+    ([MiniOfs('TEST', 123), MiniOfs('TEST', 555)],
+     [MiniOfs('TEST', 123)]),
+    # same module name, case-insensitive
+    ([MiniOfs('test', 123), MiniOfs('TEST', 555)],
+     [MiniOfs('test', 123)]),
+    # duplicates, non-consecutive
+    ([MiniOfs('test', 123), MiniOfs('abc', 111), MiniOfs('TEST', 555)],
+     [MiniOfs('test', 123), MiniOfs('abc', 111)]),
+]
+
+
+@pytest.mark.parametrize('sample, expected', distinct_module_samples)
+def test_distinct_module(sample: [MiniOfs], expected: [MiniOfs]):
+    assert distinct_module(sample) == expected