Refactor of state machine to prepare for reccmp integration

2026-01-23 16:21:15 +00:00 · 2023-10-31 15:41:23 -04:00 · 2023-10-31 15:41:23 -04:00 · 7b61b33c98
commit 7b61b33c98
parent cb7e223b63
7 changed files with 240 additions and 55 deletions
--- a/tools/checkorder/checkorder.py
+++ b/tools/checkorder/checkorder.py
@ -2,7 +2,10 @@
 import sys
 import argparse
 from typing import TextIO
-from isledecomp.dir import walk_source_dir
+from isledecomp.dir import (
+    walk_source_dir,
+    file_is_cpp
+)
 from isledecomp.parser import find_code_blocks
 from isledecomp.parser.util import (
    is_exact_offset_comment
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@ -7,6 +7,8 @@
    is_blank_or_comment,
    match_offset_comment,
    is_exact_offset_comment,
+    template_function_name,
+    remove_trailing_comment,
 )


@ -14,6 +16,9 @@ class ReaderState(Enum):
    WANT_OFFSET = 0
    WANT_SIG = 1
    IN_FUNC = 2
+    IN_TEMPLATE = 3
+    WANT_CURLY = 4
+    FUNCTION_DONE = 5


 def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
@ -25,53 +30,106 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:

    blocks = []

-    offset = None
+    offset_match = None
    offset_comment = None
    function_sig = None
    start_line = None
+    end_line = None
    state = ReaderState.WANT_OFFSET

-    for line_no, line in enumerate(stream):
-        if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC):
+    # 1-based to match cvdump and your text-editor
+    # I know it says 0, but we will increment before each readline()
+    line_no = 0
+    can_seek = True
+
+    while True:
+        # Do this before reading again so that an EOF will not
+        # cause us to miss the last function of the file.
+        if state == ReaderState.FUNCTION_DONE:
+            block = CodeBlock(offset=offset_match.address,
+                              signature=function_sig,
+                              start_line=start_line,
+                              end_line=end_line,
+                              offset_comment=offset_comment,
+                              module=offset_match.module,
+                              is_template=offset_match.is_template,
+                              is_stub=offset_match.is_stub)
+            blocks.append(block)
+            state = ReaderState.WANT_OFFSET
+
+        if can_seek:
+            line_no += 1
+            line = stream.readline()
+            if line == '':
+                break
+
+        if (state != ReaderState.WANT_OFFSET and
+                match_offset_comment(line) is not None):
+            # We hit another offset unexpectedly.
+            # We can recover easily by just ending the function here.
+            end_line = line_no - 1
+            state = ReaderState.FUNCTION_DONE
+
+            # Pause reading here so we handle the offset marker
+            # on the next loop iteration
+            can_seek = False
+
+        # Regular state machine handling begins now
+        if state == ReaderState.IN_TEMPLATE:
+            # TEMPLATE functions are a special case. The signature is
+            # given on the next line (in a // comment)
+            function_sig = template_function_name(line)
+            start_line = line_no
+            end_line = line_no
+            state = ReaderState.FUNCTION_DONE
+
+        elif state == ReaderState.WANT_SIG:
+            # Skip blank lines or comments that come after the offset
+            # marker. There is not a formal procedure for this, so just
+            # assume the next "code line" is the function signature
+            if not is_blank_or_comment(line):
+                function_sig = remove_trailing_comment(line.strip())
+
+                # Now check to see if the opening curly bracket is on the
+                # same line. clang-format should prevent this (BraceWrapping)
+                # but it is easy to detect.
+                # If the entire function is on one line, we can handle that
+                # too, although this should be limited to inlines.
+                if function_sig.endswith('{'):
+                    start_line = line_no
+                    state = ReaderState.IN_FUNC
+                elif (function_sig.endswith('}') or
+                        function_sig.endswith('};')):
+                    start_line = line_no
+                    end_line = line_no
+                    state = ReaderState.FUNCTION_DONE
+                else:
+                    state = ReaderState.WANT_CURLY
+
+        elif state == ReaderState.WANT_CURLY:
+            if line.strip() == '{':
+                start_line = line_no
+                state = ReaderState.IN_FUNC
+
+        elif state == ReaderState.IN_FUNC:
            # Naive but reasonable assumption that functions will end with
            # a curly brace on its own line with no prepended spaces.
            if line.startswith('}'):
-                # TODO: could streamline this and the next case
-                block = CodeBlock(offset=offset,
-                                  signature=function_sig,
-                                  start_line=start_line,
-                                  end_line=line_no,
-                                  offset_comment=offset_comment)
+                end_line = line_no
+                state = ReaderState.FUNCTION_DONE

-                blocks.append(block)
-                state = ReaderState.WANT_OFFSET
-            elif match_offset_comment(line) is not None:
-                # We hit another offset unexpectedly before detecting the
-                # end of the function. We can recover easily by just
-                # ending the function here.
-                block = CodeBlock(offset=offset,
-                                  signature=function_sig,
-                                  start_line=start_line,
-                                  end_line=line_no - 1,
-                                  offset_comment=offset_comment)
-
-                blocks.append(block)
-                state = ReaderState.WANT_OFFSET
-
-            # We want to grab the function signature so we can identify
-            # the code block. Skip any blank lines or comments
-            # that follow the offset comment.
-            elif (not is_blank_or_comment(line)
-                  and state == ReaderState.WANT_SIG):
-                function_sig = line.strip()
-                state = ReaderState.IN_FUNC
-
-        if state == ReaderState.WANT_OFFSET:
+        elif state == ReaderState.WANT_OFFSET:
+            # If we detected an offset marker unexpectedly, we are handling
+            # it here so we can continue seeking.
+            can_seek = True
            match = match_offset_comment(line)
            if match is not None:
-                offset = int(match, 16)
+                offset_match = match
                offset_comment = line.strip()
-                start_line = line_no
-                state = ReaderState.WANT_SIG
+
+                if match.is_template:
+                    state = ReaderState.IN_TEMPLATE
+                else:
+                    state = ReaderState.WANT_SIG

    return blocks
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@ -5,17 +5,47 @@

 CodeBlock = namedtuple('CodeBlock',
                       ['offset', 'signature', 'start_line', 'end_line',
-                        'offset_comment'])
+                        'offset_comment', 'module', 'is_template', 'is_stub'])

+OffsetMatch = namedtuple('OffsetMatch', ['module', 'address',
+                                         'is_template', 'is_stub'])
+
+# This has not been formally established, but considering that "STUB"
+# is a temporary state for a function, we assume it will appear last,
+# after any other modifiers (i.e. TEMPLATE)

 # To match a reasonable variance of formatting for the offset comment
-offsetCommentRegex = re.compile(r'//\s*OFFSET:\s*\w+\s+(?:0x)?([a-f0-9]+)(\s+STUB)?',
+offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?',  # nopep8
                                flags=re.I)

 # To match the exact syntax (text upper case, hex lower case, with spaces)
 # that is used in most places
-# TODO: template and stub mutually exclusive?
-offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)(?: STUB| TEMPLATE)?$')
+offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$')  # nopep8
+
+
+# The goal here is to just read whatever is on the next line, so some
+# flexibility in the formatting seems OK
+templateCommentRegex = re.compile(r'\s*//\s+(.*)')
+
+
+# To remove any comment (//) or block comment (/*) and its leading spaces
+# from the end of a code line
+trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')
+
+
+def template_function_name(line: str) -> str:
+    """Parse function signature for special TEMPLATE functions"""
+    template_match = templateCommentRegex.match(line)
+
+    # If we don't match, you get whatever is on the line as the signature
+    if template_match is not None:
+        return template_match.group(1)
+    else:
+        return line
+
+
+def remove_trailing_comment(line: str) -> str:
+    return trailingCommentRegex.sub('', line)


 def is_blank_or_comment(line: str) -> bool:
@ -35,11 +65,12 @@ def is_exact_offset_comment(line: str) -> bool:
    return offsetCommentExactRegex.match(line) is not None


-def match_offset_comment(line: str) -> str | None:
-    # TODO: intended to skip the expensive regex match, but is it necessary?
-    # TODO: this will skip indented offsets
-    if not line.startswith('//'):
+def match_offset_comment(line: str) -> OffsetMatch | None:
+    match = offsetCommentRegex.match(line)
+    if match is None:
        return None

-    match = offsetCommentRegex.match(line)
-    return match.group(1) if match is not None else None
+    return OffsetMatch(module=match.group(1),
+                       address=int(match.group(2), 16),
+                       is_template=match.group(3) is not None,
+                       is_stub=match.group(4) is not None)
--- a/tools/isledecomp/tests/samples/basic_class.cpp
+++ b/tools/isledecomp/tests/samples/basic_class.cpp
@ -0,0 +1,29 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// A very simple class
+
+class TestClass {
+public:
+  TestClass();
+  virtual ~TestClass() override;
+
+  virtual MxResult Tickle() override; // vtable+08
+
+  // OFFSET: LEGO1 0x12345678
+  inline const char* ClassName() const // vtable+0c
+  {
+    // 0xabcd1234
+    return "TestClass";
+  }
+
+  // OFFSET: LEGO1 0xdeadbeef
+  inline MxBool IsA(const char* name) const override // vtable+10
+  {
+    return !strcmp(name, TestClass::ClassName());
+  }
+
+private:
+  int m_hello;
+  int m_hiThere;
+};
--- a/tools/isledecomp/tests/samples/inline.cpp
+++ b/tools/isledecomp/tests/samples/inline.cpp
@ -0,0 +1,8 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// OFFSET: LEGO1 0x10000001
+inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there
+
+// OFFSET: LEGO1 0x10000002
+inline const char* OneLine() const { return "MxDSObject"; };
--- a/tools/isledecomp/tests/samples/poorly_formatted.cpp
+++ b/tools/isledecomp/tests/samples/poorly_formatted.cpp
@ -0,0 +1,23 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// While it's reasonable to expect a well-formed file (and clang-format
+// will make sure we get one), this will put the parser through its paces.
+
+// OFFSET: TEST 0x1234
+void curly_with_spaces()
+  {
+  static char* msg = "hello";
+  }
+
+// OFFSET: TEST 0x5555
+void weird_closing_curly()
+{
+  int x = 123; }
+
+// OFFSET: HELLO 0x5656
+void bad_indenting() {
+  if (0)
+{
+  int y = 5;
+}}
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@ -30,9 +30,10 @@ def test_sanity():

    assert len(blocks) == 3
    assert code_blocks_are_sorted(blocks) is True
-    # n.b. The parser returns line numbers as 0-based
-    assert blocks[0].start_line == 5
-    assert blocks[0].end_line == 9
+    # n.b. The parser returns line numbers as 1-based
+    # Function starts when we see the opening curly brace
+    assert blocks[0].start_line == 8
+    assert blocks[0].end_line == 10


 def test_oneline():
@ -42,11 +43,7 @@ def test_oneline():
        blocks = find_code_blocks(f)

    assert len(blocks) == 2
-    assert blocks[0].start_line == 3
-    # TODO: Because of the way it works now, this captures the blank line
-    # as part of the function. That's not *incorrect* per se, but
-    # this needs to be more consistent if we want the tool to sort the
-    # code blocks in the file.
+    assert blocks[0].start_line == 5
    assert blocks[0].end_line == 5


@ -70,3 +67,39 @@ def test_jumbled_case():

    assert len(blocks) == 3
    assert code_blocks_are_sorted(blocks) is False
+
+
+def test_bad_file():
+    with sample_file('poorly_formatted.cpp') as f:
+        blocks = find_code_blocks(f)
+
+    assert len(blocks) == 3
+
+
+def test_indented():
+    """Offsets for functions inside of a class will probably be indented."""
+    with sample_file('basic_class.cpp') as f:
+        blocks = find_code_blocks(f)
+
+    # TODO: We don't properly detect the end of these functions
+    # because the closing brace is indented. However... knowing where each
+    # function ends is less important (for now) than capturing
+    # all the functions that are there.
+
+    assert len(blocks) == 2
+    assert blocks[0].offset == int('0x12345678', 16)
+    assert blocks[0].start_line == 15
+    #assert blocks[0].end_line == 18
+
+    assert blocks[1].offset == int('0xdeadbeef', 16)
+    assert blocks[1].start_line == 22
+    #assert blocks[1].end_line == 24
+
+def test_inline():
+    with sample_file('inline.cpp') as f:
+        blocks = find_code_blocks(f)
+
+    assert len(blocks) == 2
+    for block in blocks:
+        assert block.start_line is not None
+        assert block.start_line == block.end_line