From 75802101accb5404e4b21315596c6b81a9931f15 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 15:10:32 -0500
Subject: [PATCH 01/17] Merge from parser2 branch

---
 tools/checkorder/checkorder.py                |  34 +-
 .../isledecomp/isledecomp/parser/__init__.py  |   2 +-
 tools/isledecomp/isledecomp/parser/error.py   |  33 ++
 tools/isledecomp/isledecomp/parser/node.py    |  41 ++
 tools/isledecomp/isledecomp/parser/parser.py  | 383 +++++++++++++-----
 tools/isledecomp/isledecomp/parser/util.py    |  70 +---
 .../isledecomp/tests/samples/basic_class.cpp  |   5 +-
 tools/isledecomp/tests/samples/basic_file.cpp |   6 +-
 .../tests/samples/global_variables.cpp        |  14 +
 tools/isledecomp/tests/samples/inline.cpp     |   4 +-
 .../tests/samples/missing_offset.cpp          |   2 +-
 .../tests/samples/multiple_offsets.cpp        |  12 +-
 .../tests/samples/oneline_function.cpp        |   4 +-
 .../isledecomp/tests/samples/out_of_order.cpp |   6 +-
 .../tests/samples/poorly_formatted.cpp        |   6 +-
 tools/isledecomp/tests/test_parser.py         | 235 ++++++-----
 tools/isledecomp/tests/test_parser_samples.py | 141 +++++++
 .../tests/test_parser_statechange.py          | 150 +++++++
 tools/isledecomp/tests/test_parser_util.py    | 107 +++--
 tools/reccmp/reccmp.py                        |  20 +-
 20 files changed, 923 insertions(+), 352 deletions(-)
 create mode 100644 tools/isledecomp/isledecomp/parser/error.py
 create mode 100644 tools/isledecomp/isledecomp/parser/node.py
 create mode 100644 tools/isledecomp/tests/samples/global_variables.cpp
 create mode 100644 tools/isledecomp/tests/test_parser_samples.py
 create mode 100644 tools/isledecomp/tests/test_parser_statechange.py

diff --git a/tools/checkorder/checkorder.py b/tools/checkorder/checkorder.py
index 1ac8391f..d2f0b23e 100644
--- a/tools/checkorder/checkorder.py
+++ b/tools/checkorder/checkorder.py
@@ -2,8 +2,7 @@
 import sys
 import argparse
 from isledecomp.dir import walk_source_dir, is_file_cpp
-from isledecomp.parser import find_code_blocks
-from isledecomp.parser.util import is_exact_offset_comment
+from isledecomp.parser import DecompParser
 
 
 def sig_truncate(sig: str) -> str:
@@ -16,23 +15,22 @@ def check_file(filename: str, verbose: bool = False) -> bool:
     """Open and read the given file, then check whether the code blocks
     are in order. If verbose, print each block."""
 
+    parser = DecompParser()
     with open(filename, "r", encoding="utf-8") as f:
-        code_blocks = find_code_blocks(f)
+        parser.read_lines(f)
 
-    bad_comments = [
-        (block.start_line, block.offset_comment)
-        for block in code_blocks
-        if not is_exact_offset_comment(block.offset_comment)
-    ]
 
-    just_offsets = [block.offset for block in code_blocks]
+    just_offsets = [block.offset for block in parser.functions]
     sorted_offsets = sorted(just_offsets)
     file_out_of_order = just_offsets != sorted_offsets
 
+    # TODO: When we add parser error severity, actual errors that obstruct
+    # parsing should probably be shown here regardless of verbose mode
+
     # If we detect inexact comments, don't print anything unless we are
     # in verbose mode. If the file is out of order, we always print the
     # file name.
-    should_report = (len(bad_comments) > 0 and verbose) or file_out_of_order
+    should_report = (len(parser.alerts) > 0 and verbose) or file_out_of_order
 
     if not should_report and not file_out_of_order:
         return False
@@ -44,22 +42,22 @@ def check_file(filename: str, verbose: bool = False) -> bool:
             order_lookup = {k: i for i, k in enumerate(sorted_offsets)}
             prev_offset = 0
 
-            for block in code_blocks:
+            for fun in parser.functions:
                 msg = " ".join(
                     [
-                        " " if block.offset > prev_offset else "!",
+                        " " if fun.offset > prev_offset else "!",
                         f"{block.offset:08x}",
-                        f"{block.end_line - block.start_line:4} lines",
-                        f"{order_lookup[block.offset]:3}",
+                        f"{fun.end_line - fun.line_number:4} lines",
+                        f"{order_lookup[fun.offset]:3}",
                         "    ",
-                        sig_truncate(block.signature),
+                        sig_truncate(fun.signature),
                     ]
                 )
                 print(msg)
-                prev_offset = block.offset
+                prev_offset = fun.offset
 
-        for line_no, line in bad_comments:
-            print(f"* line {line_no:3} bad offset comment ({line})")
+        for alert in parser.alerts:
+            print(f"* line {alert.line_number:4} {alert.code} ({alert.line})")
 
         print()
 
diff --git a/tools/isledecomp/isledecomp/parser/__init__.py b/tools/isledecomp/isledecomp/parser/__init__.py
index 0d504619..c9394d4a 100644
--- a/tools/isledecomp/isledecomp/parser/__init__.py
+++ b/tools/isledecomp/isledecomp/parser/__init__.py
@@ -1 +1 @@
-from .parser import find_code_blocks
+from .parser import DecompParser
diff --git a/tools/isledecomp/isledecomp/parser/error.py b/tools/isledecomp/isledecomp/parser/error.py
new file mode 100644
index 00000000..8bda90da
--- /dev/null
+++ b/tools/isledecomp/isledecomp/parser/error.py
@@ -0,0 +1,33 @@
+from enum import Enum
+
+
+class ParserError(Enum):
+    # WARN: Stub function exceeds some line number threshold
+    UNLIKELY_STUB = 100
+
+    # WARN: Decomp marker is close enough to be recognized, but does not follow syntax exactly
+    BAD_DECOMP_MARKER = 101
+
+    # WARN: Multiple markers in sequence do not have distinct modules
+    DUPLICATE_MODULE = 102
+
+    # WARN: Detected a dupcliate module/offset pair in the current file
+    DUPLICATE_OFFSET = 103
+
+    # WARN: We read a line that matches the decomp marker pattern, but we are not set up
+    # to handle it
+    BOGUS_MARKER = 104
+
+    # WARN: Under a synthetic marker we expected a comment but found a code line instead
+    SYNTHETIC_NOT_COMMENT = 110
+
+    # WARN: New function marker appeared while we were inside a function
+    MISSED_END_OF_FUNCTION = 117
+
+    # ERROR: We found a marker unexpectedly
+    UNEXPECTED_MARKER = 200
+
+    # ERROR: We found a marker where we expected to find one, but it is incompatible
+    # with the preceding markers.
+    # For example, a GLOBAL cannot follow FUNCTION/STUB
+    INCOMPATIBLE_MARKER = 201
diff --git a/tools/isledecomp/isledecomp/parser/node.py b/tools/isledecomp/isledecomp/parser/node.py
new file mode 100644
index 00000000..f9fbe3b5
--- /dev/null
+++ b/tools/isledecomp/isledecomp/parser/node.py
@@ -0,0 +1,41 @@
+from dataclasses import dataclass
+from enum import Enum
+
+
+@dataclass
+class ParserNode:
+    line_number: int
+
+
+@dataclass
+class ParserAlert(ParserNode):
+    code: int
+    line: str
+
+
+@dataclass
+class ParserSymbol(ParserNode):
+    module: str
+    offset: int
+
+
+@dataclass
+class ParserFunction(ParserSymbol):
+    name: str
+    is_stub: bool = False
+    is_synthetic: bool = False
+    is_template: bool = False
+    end_line: int = -1
+
+
+@dataclass
+class ParserVariable(ParserSymbol):
+    name: str
+    size: int = -1
+    is_static: bool = False
+
+
+@dataclass
+class ParserVtable(ParserSymbol):
+    class_name: str
+    num_entries: int = -1
diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py
index 1039ac41..aebe1c86 100644
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -1,145 +1,346 @@
 # C++ file parser
 
-from typing import List, TextIO
+from typing import List, TextIO, Iterable
 from enum import Enum
 from .util import (
-    CodeBlock,
-    OffsetMatch,
+    DecompMarker,
     is_blank_or_comment,
-    match_offset_comment,
+    match_marker,
+    is_marker_exact,
     get_template_function_name,
     remove_trailing_comment,
-    distinct_by_module,
 )
+from .node import (
+    ParserAlert,
+    ParserNode,
+    ParserFunction,
+    ParserVariable,
+    ParserVtable,
+)
+from .error import ParserError
 
 
 class ReaderState(Enum):
-    WANT_OFFSET = 0
+    SEARCH = 0
     WANT_SIG = 1
     IN_FUNC = 2
     IN_TEMPLATE = 3
     WANT_CURLY = 4
-    FUNCTION_DONE = 5
+    IN_GLOBAL = 5
+    IN_FUNC_GLOBAL = 6
+    IN_VTABLE = 7
 
 
-def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
-    """Read the IO stream (file) line-by-line and give the following report:
-    Foreach code block (function) in the file, what are its starting and
-    ending line numbers, and what is the given offset in the original
-    binary. We expect the result to be ordered by line number because we
-    are reading the file from start to finish."""
+def marker_is_stub(marker: DecompMarker) -> bool:
+    return marker.type.upper() == "STUB"
 
-    blocks: List[CodeBlock] = []
 
-    offset_matches: List[OffsetMatch] = []
+def marker_is_variable(marker: DecompMarker) -> bool:
+    return marker.type.upper() == "GLOBAL"
 
-    function_sig = None
-    start_line = None
-    end_line = None
-    state = ReaderState.WANT_OFFSET
 
-    # 1-based to match cvdump and your text editor
-    # I know it says 0, but we will increment before each readline()
-    line_no = 0
-    can_seek = True
+def marker_is_synthetic(marker: DecompMarker) -> bool:
+    return marker.type.upper() in ("SYNTHETIC", "TEMPLATE")
 
-    while True:
-        # Do this before reading again so that an EOF will not
-        # cause us to miss the last function of the file.
-        if state == ReaderState.FUNCTION_DONE:
-            # Our list of offset marks could have duplicates on
-            # module name, so we'll eliminate those now.
-            for offset_match in distinct_by_module(offset_matches):
-                block = CodeBlock(
-                    offset=offset_match.address,
-                    signature=function_sig,
-                    start_line=start_line,
+
+def marker_is_function(marker: DecompMarker) -> bool:
+    return marker.type.upper() in ("FUNCTION", "STUB")
+
+
+def marker_is_vtable(marker: DecompMarker) -> bool:
+    return marker.type.upper() == "VTABLE"
+
+
+class MarkerDict:
+    def __init__(self):
+        self.markers: dict = {}
+
+    def insert(self, marker: DecompMarker) -> bool:
+        module = marker.module.upper()
+        # Return True if this insert would overwrite
+        if module in self.markers:
+            return True
+
+        self.markers[module] = (marker.type, marker.offset)
+        return False
+
+    def iter(self):
+        for module in self.markers:
+            (marker_type, offset) = self.markers[module]
+            yield DecompMarker(marker_type, module, offset)
+
+    def empty(self):
+        self.markers = {}
+
+
+class DecompParser:
+    def __init__(self):
+        self.fun_markers = MarkerDict()
+        self.var_markers = MarkerDict()
+        self.tbl_markers = MarkerDict()
+        self.reset()
+
+    def reset(self):
+        # Output values
+        self.functions = []
+        self.vtables = []
+        self.variables = []
+        self.alerts = []
+
+        # Internal state machine stuff
+        self.line_number: int = 0
+        self.state: ReaderState = ReaderState.SEARCH
+
+        self.last_line: str = ""
+        self.fun_markers.empty()
+        self.var_markers.empty()
+        self.tbl_markers.empty()
+        self.function_start: int = 0
+        self.function_sig: str = ""
+
+    def _recover(self):
+        """We hit a syntax error and need to reset temp structures"""
+        self.state = ReaderState.SEARCH
+        self.fun_markers.empty()
+        self.var_markers.empty()
+        self.tbl_markers.empty()
+
+    def _syntax_warning(self, code):
+        self.alerts.append(
+            ParserAlert(
+                line_number=self.line_number,
+                code=code,
+                line=self.last_line.strip(),
+            )
+        )
+
+    def _syntax_error(self, code):
+        self._syntax_warning(code)
+        self._recover()
+
+    def _function_starts_here(self):
+        self.function_start = self.line_number
+
+    def _function_marker(self, marker: DecompMarker):
+        if self.fun_markers.insert(marker):
+            self._syntax_warning(ParserError.DUPLICATE_MODULE)
+        self.state = ReaderState.WANT_SIG
+
+    def _synthetic_marker(self, marker: DecompMarker):
+        if self.fun_markers.insert(marker):
+            self._syntax_warning(ParserError.DUPLICATE_MODULE)
+        self.state = ReaderState.IN_TEMPLATE
+
+    def _function_done(self, unexpected: bool = False):
+        end_line = self.line_number
+        if unexpected:
+            end_line -= -1
+
+        for marker in self.fun_markers.iter():
+            self.functions.append(
+                ParserFunction(
+                    line_number=self.function_start,
+                    module=marker.module,
+                    offset=marker.offset,
+                    is_stub=marker_is_stub(marker),
+                    is_template=marker_is_synthetic(marker),
+                    name=self.function_sig,
                     end_line=end_line,
-                    offset_comment=offset_match.comment,
-                    module=offset_match.module,
-                    is_template=offset_match.is_template,
-                    is_stub=offset_match.is_stub,
                 )
-                blocks.append(block)
-            offset_matches = []
-            state = ReaderState.WANT_OFFSET
+            )
 
-        if can_seek:
-            line_no += 1
-            line = stream.readline()
-            if line == "":
-                break
+        self.fun_markers.empty()
+        self.state = ReaderState.SEARCH
 
-        new_match = match_offset_comment(line)
-        if new_match is not None:
-            # We will allow multiple offsets if we have just begun
-            # the code block, but not after we hit the curly brace.
-            if state in (
-                ReaderState.WANT_OFFSET,
-                ReaderState.IN_TEMPLATE,
+    def _vtable_marker(self, marker: DecompMarker):
+        if self.tbl_markers.insert(marker):
+            self._syntax_warning(ParserError.DUPLICATE_MODULE)
+        self.state = ReaderState.IN_VTABLE
+
+    def _vtable_done(self):
+        for marker in self.tbl_markers.iter():
+            self.vtables.append(
+                ParserVtable(
+                    line_number=self.line_number,
+                    module=marker.module,
+                    offset=marker.offset,
+                    class_name=self.last_line.strip(),
+                )
+            )
+
+        self.tbl_markers.empty()
+        self.state = ReaderState.SEARCH
+
+    def _variable_marker(self, marker: DecompMarker):
+        if self.var_markers.insert(marker):
+            self._syntax_warning(ParserError.DUPLICATE_MODULE)
+
+        if self.state in (ReaderState.IN_FUNC, ReaderState.IN_FUNC_GLOBAL):
+            self.state = ReaderState.IN_FUNC_GLOBAL
+        else:
+            self.state = ReaderState.IN_GLOBAL
+
+    def _variable_done(self):
+        for marker in self.var_markers.iter():
+            self.variables.append(
+                ParserVariable(
+                    line_number=self.line_number,
+                    module=marker.module,
+                    offset=marker.offset,
+                    name=self.last_line.strip(),
+                )
+            )
+
+        self.var_markers.empty()
+        if self.state == ReaderState.IN_FUNC_GLOBAL:
+            self.state = ReaderState.IN_FUNC
+        else:
+            self.state = ReaderState.SEARCH
+
+    def _handle_marker(self, marker: DecompMarker):
+        # Cannot handle any markers between function sig and opening curly brace
+        if self.state == ReaderState.WANT_CURLY:
+            self._syntax_error(ParserError.UNEXPECTED_MARKER)
+            return
+
+        # TODO: How uncertain are we of detecting the end of a function
+        # in a clang-formatted file? For now we assume we have missed the
+        # end if we detect a non-GLOBAL marker while state is IN_FUNC.
+        # Maybe these cases should be syntax errors instead
+
+        if marker_is_function(marker):
+            if self.state in (
+                ReaderState.SEARCH,
                 ReaderState.WANT_SIG,
             ):
-                # If we detected an offset marker unexpectedly,
-                # we are handling it here so we can continue seeking.
-                can_seek = True
-
-                offset_matches.append(new_match)
-
-                if new_match.is_template:
-                    state = ReaderState.IN_TEMPLATE
-                else:
-                    state = ReaderState.WANT_SIG
-            else:
+                # We will allow multiple offsets if we have just begun
+                # the code block, but not after we hit the curly brace.
+                self._function_marker(marker)
+            elif self.state == ReaderState.IN_FUNC:
                 # We hit another offset unexpectedly.
                 # We can recover easily by just ending the function here.
-                end_line = line_no - 1
-                state = ReaderState.FUNCTION_DONE
+                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
+                self._function_done()
 
-                # Pause reading here so we handle the offset marker
-                # on the next loop iteration
-                can_seek = False
+                # Start the next function right after so we can
+                # read the next line.
+                self._function_marker(marker)
+            else:
+                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
 
-        elif state == ReaderState.IN_TEMPLATE:
+        elif marker_is_synthetic(marker):
+            if self.state in (ReaderState.SEARCH, ReaderState.IN_TEMPLATE):
+                self._synthetic_marker(marker)
+            elif self.state == ReaderState.IN_FUNC:
+                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
+                self._function_done()
+                self._synthetic_marker(marker)
+            else:
+                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
+
+        elif marker_is_variable(marker):
+            if self.state in (
+                ReaderState.SEARCH,
+                ReaderState.IN_GLOBAL,
+                ReaderState.IN_FUNC,
+                ReaderState.IN_FUNC_GLOBAL,
+            ):
+                self._variable_marker(marker)
+            else:
+                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
+
+        elif marker_is_vtable(marker):
+            if self.state in (ReaderState.SEARCH, ReaderState.IN_VTABLE):
+                self._vtable_marker(marker)
+            elif self.state == ReaderState.IN_FUNC:
+                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
+                self._function_done()
+                self._vtable_marker(marker)
+            else:
+                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
+
+        else:
+            self._syntax_warning(ParserError.BOGUS_MARKER)
+
+    def read_line(self, line: str):
+        self.last_line = line  # TODO: Useful or hack for error reporting?
+        self.line_number += 1
+
+        marker = match_marker(line)
+        if marker is not None:
+            # TODO: what's the best place for this?
+            # Does it belong with reading or marker handling?
+            if not is_marker_exact(self.last_line):
+                self._syntax_warning(ParserError.BAD_DECOMP_MARKER)
+            self._handle_marker(marker)
+            return
+
+        if self.state == ReaderState.IN_TEMPLATE:
             # TEMPLATE functions are a special case. The signature is
             # given on the next line (in a // comment)
-            function_sig = get_template_function_name(line)
-            start_line = line_no
-            end_line = line_no
-            state = ReaderState.FUNCTION_DONE
+            self.function_sig = get_template_function_name(line)
+            self._function_starts_here()
+            self._function_done()
 
-        elif state == ReaderState.WANT_SIG:
+        elif self.state == ReaderState.WANT_SIG:
             # Skip blank lines or comments that come after the offset
             # marker. There is not a formal procedure for this, so just
             # assume the next "code line" is the function signature
             if not is_blank_or_comment(line):
                 # Inline functions may end with a comment. Strip that out
                 # to help parsing.
-                function_sig = remove_trailing_comment(line.strip())
+                self.function_sig = remove_trailing_comment(line.strip())
 
                 # Now check to see if the opening curly bracket is on the
                 # same line. clang-format should prevent this (BraceWrapping)
                 # but it is easy to detect.
                 # If the entire function is on one line, handle that too.
-                if function_sig.endswith("{"):
-                    start_line = line_no
-                    state = ReaderState.IN_FUNC
-                elif function_sig.endswith("}") or function_sig.endswith("};"):
-                    start_line = line_no
-                    end_line = line_no
-                    state = ReaderState.FUNCTION_DONE
+                if self.function_sig.endswith("{"):
+                    self._function_starts_here()
+                    self.state = ReaderState.IN_FUNC
+                elif self.function_sig.endswith("}") or self.function_sig.endswith(
+                    "};"
+                ):
+                    self._function_starts_here()
+                    self._function_done()
                 else:
-                    state = ReaderState.WANT_CURLY
+                    self.state = ReaderState.WANT_CURLY
 
-        elif state == ReaderState.WANT_CURLY:
+        elif self.state == ReaderState.WANT_CURLY:
             if line.strip() == "{":
-                start_line = line_no
-                state = ReaderState.IN_FUNC
+                self._function_starts_here()
+                self.state = ReaderState.IN_FUNC
 
-        elif state == ReaderState.IN_FUNC:
+        elif self.state == ReaderState.IN_FUNC:
             # Naive but reasonable assumption that functions will end with
             # a curly brace on its own line with no prepended spaces.
             if line.startswith("}"):
-                end_line = line_no
-                state = ReaderState.FUNCTION_DONE
+                self._function_done()
 
-    return blocks
+        elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL):
+            if not is_blank_or_comment(line):
+                self._variable_done()
+
+        elif self.state == ReaderState.IN_VTABLE:
+            if not is_blank_or_comment(line):
+                self._vtable_done()
+
+    def read_lines(self, lines: Iterable):
+        for line in lines:
+            self.read_line(line)
+
+
+def find_code_blocks(stream: TextIO) -> List[ParserNode]:
+    """Read the IO stream (file) line-by-line and give the following report:
+    Foreach code block (function) in the file, what are its starting and
+    ending line numbers, and what is the given offset in the original
+    binary. We expect the result to be ordered by line number because we
+    are reading the file from start to finish."""
+
+    # TODO: this will be replaced shortly. shim for now to avoid
+    # making more changes elsewhere
+    p = DecompParser()
+    for line in stream:
+        p.read_line(line)
+
+    return p.functions
diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py
index 59fca75b..02d3c976 100644
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -4,41 +4,15 @@
 from typing import List
 from collections import namedtuple
 
+DecompMarker = namedtuple("DecompMarker", ["type", "module", "offset"])
 
-CodeBlock = namedtuple(
-    "CodeBlock",
-    [
-        "offset",
-        "signature",
-        "start_line",
-        "end_line",
-        "offset_comment",
-        "module",
-        "is_template",
-        "is_stub",
-    ],
-)
 
-OffsetMatch = namedtuple(
-    "OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"]
-)
-
-# This has not been formally established, but considering that "STUB"
-# is a temporary state for a function, we assume it will appear last,
-# after any other modifiers (i.e. TEMPLATE)
-
-# To match a reasonable variance of formatting for the offset comment
-offsetCommentRegex = re.compile(
-    r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?",  # nopep8
+markerRegex = re.compile(
+    r"\s*//\s*(\w+):\s*(\w+)\s+((?:0x)?[a-f0-9]+)",
     flags=re.I,
 )
 
-# To match the exact syntax (text upper case, hex lower case, with spaces)
-# that is used in most places
-offsetCommentExactRegex = re.compile(
-    r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$"
-)  # nopep8
-
+markerExactRegex = re.compile(r"// ([A-Z]+): ([A-Z0-9]+) (0x[a-f0-9]+)$")
 
 # The goal here is to just read whatever is on the next line, so some
 # flexibility in the formatting seems OK
@@ -78,39 +52,15 @@ def is_blank_or_comment(line: str) -> bool:
     )
 
 
-def is_exact_offset_comment(line: str) -> bool:
-    """If the offset comment does not match our (unofficial) syntax
-    we may want to alert the user to fix it for style points."""
-    return offsetCommentExactRegex.match(line) is not None
-
-
-def match_offset_comment(line: str) -> OffsetMatch | None:
-    match = offsetCommentRegex.match(line)
+def match_marker(line: str) -> DecompMarker | None:
+    match = markerRegex.match(line)
     if match is None:
         return None
 
-    return OffsetMatch(
-        module=match.group(1),
-        address=int(match.group(2), 16),
-        is_template=match.group(3) is not None,
-        is_stub=match.group(4) is not None,
-        comment=line.strip(),
+    return DecompMarker(
+        type=match.group(1), module=match.group(2), offset=int(match.group(3), 16)
     )
 
 
-def distinct_by_module(offsets: List) -> List:
-    """Given a list of offset markers, return a list with distinct
-    module names. If module names (case-insensitive) are repeated,
-    choose the offset that appears first."""
-
-    if len(offsets) < 2:
-        return offsets
-
-    # Dict maintains insertion order in python >=3.7
-    offsets_dict = {}
-    for offset in offsets:
-        module_upper = offset.module.upper()
-        if module_upper not in offsets_dict:
-            offsets_dict[module_upper] = offset
-
-    return list(offsets_dict.values())
+def is_marker_exact(line: str) -> bool:
+    return markerExactRegex.match(line) is not None
diff --git a/tools/isledecomp/tests/samples/basic_class.cpp b/tools/isledecomp/tests/samples/basic_class.cpp
index 23ce3c39..4316ad4a 100644
--- a/tools/isledecomp/tests/samples/basic_class.cpp
+++ b/tools/isledecomp/tests/samples/basic_class.cpp
@@ -3,6 +3,7 @@
 
 // A very simple class
 
+// VTABLE: TEST 0x1001002
 class TestClass {
 public:
   TestClass();
@@ -10,14 +11,14 @@ class TestClass {
 
   virtual MxResult Tickle() override; // vtable+08
 
-  // OFFSET: TEST 0x12345678
+  // FUNCTION: TEST 0x12345678
   inline const char* ClassName() const // vtable+0c
   {
     // 0xabcd1234
     return "TestClass";
   }
 
-  // OFFSET: TEST 0xdeadbeef
+  // FUNCTION: TEST 0xdeadbeef
   inline MxBool IsA(const char* name) const override // vtable+10
   {
     return !strcmp(name, TestClass::ClassName());
diff --git a/tools/isledecomp/tests/samples/basic_file.cpp b/tools/isledecomp/tests/samples/basic_file.cpp
index 6a4017b5..99930de8 100644
--- a/tools/isledecomp/tests/samples/basic_file.cpp
+++ b/tools/isledecomp/tests/samples/basic_file.cpp
@@ -3,19 +3,19 @@
 
 // A very simple well-formed code file
 
-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
 void function01()
 {
   // TODO
 }
 
-// OFFSET: TEST 0x2345
+// FUNCTION: TEST 0x2345
 void function02()
 {
   // TODO
 }
 
-// OFFSET: TEST 0x3456
+// FUNCTION: TEST 0x3456
 void function03()
 {
   // TODO
diff --git a/tools/isledecomp/tests/samples/global_variables.cpp b/tools/isledecomp/tests/samples/global_variables.cpp
new file mode 100644
index 00000000..3be0316a
--- /dev/null
+++ b/tools/isledecomp/tests/samples/global_variables.cpp
@@ -0,0 +1,14 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// Global variables inside and outside of functions
+
+// GLOBAL: TEST 0x1000
+const char *g_message = "test";
+
+// FUNCTION: TEST 0x1234
+void function01()
+{
+  // GLOBAL: TEST 0x5555
+  static int g_hello = 123;
+}
diff --git a/tools/isledecomp/tests/samples/inline.cpp b/tools/isledecomp/tests/samples/inline.cpp
index 0bfedf6d..8a36c89a 100644
--- a/tools/isledecomp/tests/samples/inline.cpp
+++ b/tools/isledecomp/tests/samples/inline.cpp
@@ -1,8 +1,8 @@
 // Sample for python unit tests
 // Not part of the decomp
 
-// OFFSET: TEST 0x10000001
+// FUNCTION: TEST 0x10000001
 inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there
 
-// OFFSET: TEST 0x10000002
+// FUNCTION: TEST 0x10000002
 inline const char* OneLine() const { return "MxDSObject"; };
diff --git a/tools/isledecomp/tests/samples/missing_offset.cpp b/tools/isledecomp/tests/samples/missing_offset.cpp
index 332fed2c..3f6b3811 100644
--- a/tools/isledecomp/tests/samples/missing_offset.cpp
+++ b/tools/isledecomp/tests/samples/missing_offset.cpp
@@ -9,7 +9,7 @@ int no_offset_comment()
   return -1;
 }
 
-// OFFSET: TEST 0xdeadbeef
+// FUNCTION: TEST 0xdeadbeef
 void regular_ole_function()
 {
   printf("hi there");
diff --git a/tools/isledecomp/tests/samples/multiple_offsets.cpp b/tools/isledecomp/tests/samples/multiple_offsets.cpp
index eecdd95b..dc3c5bfa 100644
--- a/tools/isledecomp/tests/samples/multiple_offsets.cpp
+++ b/tools/isledecomp/tests/samples/multiple_offsets.cpp
@@ -3,22 +3,22 @@
 
 // Handling multiple offset markers
 
-// OFFSET: TEST 0x1234
-// OFFSET: HELLO 0x5555
+// FUNCTION: TEST 0x1234
+// FUNCTION: HELLO 0x5555
 void different_modules()
 {
   // TODO
 }
 
-// OFFSET: TEST 0x2345
-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x2345
+// FUNCTION: TEST 0x1234
 void same_module()
 {
   // TODO
 }
 
-// OFFSET: TEST 0x2002
-// OFFSET: test 0x1001
+// FUNCTION: TEST 0x2002
+// FUNCTION: test 0x1001
 void same_case_insensitive()
 {
   // TODO
diff --git a/tools/isledecomp/tests/samples/oneline_function.cpp b/tools/isledecomp/tests/samples/oneline_function.cpp
index 8d7fdc5a..feb82314 100644
--- a/tools/isledecomp/tests/samples/oneline_function.cpp
+++ b/tools/isledecomp/tests/samples/oneline_function.cpp
@@ -1,10 +1,10 @@
 // Sample for python unit tests
 // Not part of the decomp
 
-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
 void short_function() { static char* msg = "oneliner"; }
 
-// OFFSET: TEST 0x5555
+// FUNCTION: TEST 0x5555
 void function_after_one_liner()
 {
   // This function comes after the previous that is on a single line.
diff --git a/tools/isledecomp/tests/samples/out_of_order.cpp b/tools/isledecomp/tests/samples/out_of_order.cpp
index 749c4f2b..951c99e7 100644
--- a/tools/isledecomp/tests/samples/out_of_order.cpp
+++ b/tools/isledecomp/tests/samples/out_of_order.cpp
@@ -1,19 +1,19 @@
 // Sample for python unit tests
 // Not part of the decomp
 
-// OFFSET: TEST 0x1001
+// FUNCTION: TEST 0x1001
 void function_order01()
 {
     // TODO
 }
 
-// OFFSET: TEST 0x1003
+// FUNCTION: TEST 0x1003
 void function_order03()
 {
     // TODO
 }
 
-// OFFSET: TEST 0x1002
+// FUNCTION: TEST 0x1002
 void function_order02()
 {
     // TODO
diff --git a/tools/isledecomp/tests/samples/poorly_formatted.cpp b/tools/isledecomp/tests/samples/poorly_formatted.cpp
index 32dd774c..69f365ec 100644
--- a/tools/isledecomp/tests/samples/poorly_formatted.cpp
+++ b/tools/isledecomp/tests/samples/poorly_formatted.cpp
@@ -4,18 +4,18 @@
 // While it's reasonable to expect a well-formed file (and clang-format
 // will make sure we get one), this will put the parser through its paces.
 
-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
 void curly_with_spaces()
   {
   static char* msg = "hello";
   }
 
-// OFFSET: TEST 0x5555
+// FUNCTION: TEST 0x5555
 void weird_closing_curly()
 {
   int x = 123; }
 
-// OFFSET: HELLO 0x5656
+// FUNCTION: HELLO 0x5656
 void bad_indenting() {
   if (0)
 {
diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py
index 48bb0e44..7731918f 100644
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -1,127 +1,170 @@
-import os
-from typing import List, TextIO
-from isledecomp.parser import find_code_blocks
-from isledecomp.parser.util import CodeBlock
-
-SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
+import pytest
+from isledecomp.parser.parser import (
+    ReaderState,
+    DecompParser,
+)
+from isledecomp.parser.util import DecompMarker
+from isledecomp.parser.error import ParserError
 
 
-def sample_file(filename: str) -> TextIO:
-    """Wrapper for opening the samples from the directory that does not
-    depend on the cwd where we run the test"""
-    full_path = os.path.join(SAMPLE_DIR, filename)
-    return open(full_path, "r", encoding="utf-8")
+@pytest.fixture
+def parser():
+    return DecompParser()
 
 
-def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
-    """Helper to make this more idiomatic"""
-    just_offsets = [block.offset for block in blocks]
-    return just_offsets == sorted(just_offsets)
+@pytest.mark.skip(reason="todo")
+def test_missing_sig(parser):
+    """Bad syntax: function signature is missing"""
+    parser.read_lines(["// FUNCTION: TEST 0x1234", "{"])
+    assert parser.state == ReaderState.IN_FUNC
+    assert len(parser.alerts) == 1
+    parser.read_line("}")
+    assert len(parser.functions) == 1
+    assert parser.functions[0] != "{"
 
 
-# Tests are below #
+def test_not_exact_syntax(parser):
+    """Alert to inexact syntax right here in the parser instead of kicking it downstream.
+    Doing this means we don't have to save the actual text."""
+    parser.read_line("// function: test 1234")
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.BAD_DECOMP_MARKER
 
 
-def test_sanity():
-    """Read a very basic file"""
-    with sample_file("basic_file.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_invalid_marker(parser):
+    """We matched a decomp marker, but it's not one we care about"""
+    parser.read_line("// BANANA: TEST 0x1234")
+    assert parser.state == ReaderState.SEARCH
 
-    assert len(blocks) == 3
-    assert code_blocks_are_sorted(blocks) is True
-    # n.b. The parser returns line numbers as 1-based
-    # Function starts when we see the opening curly brace
-    assert blocks[0].start_line == 8
-    assert blocks[0].end_line == 10
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.BOGUS_MARKER
 
 
-def test_oneline():
-    """(Assuming clang-format permits this) This sample has a function
-    on a single line. This will test the end-of-function detection"""
-    with sample_file("oneline_function.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    assert len(blocks) == 2
-    assert blocks[0].start_line == 5
-    assert blocks[0].end_line == 5
+def test_unexpected_marker(parser):
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "// GLOBAL: TEST 0x5000",
+        ]
+    )
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
 
 
-def test_missing_offset():
-    """What if the function doesn't have an offset comment?"""
-    with sample_file("missing_offset.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    # TODO: For now, the function without the offset will just be ignored.
-    # Would be the same outcome if the comment was present but mangled and
-    # we failed to match it. We should detect these cases in the future.
-    assert len(blocks) == 1
+def test_variable(parser):
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "int g_value = 5;",
+        ]
+    )
+    assert len(parser.variables) == 1
 
 
-def test_jumbled_case():
-    """The parser just reports what it sees. It is the responsibility of
-    the downstream tools to do something about a jumbled file.
-    Just verify that we are reading it correctly."""
-    with sample_file("out_of_order.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    assert len(blocks) == 3
-    assert code_blocks_are_sorted(blocks) is False
+def test_synthetic_plus_marker(parser):
+    """Should fail with error and not log the synthetic"""
+    parser.read_lines(
+        [
+            "// SYNTHETIC: HEY 0x555",
+            "// FUNCTION: HOWDY 0x1234",
+        ]
+    )
+    assert len(parser.functions) == 0
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
 
 
-def test_bad_file():
-    with sample_file("poorly_formatted.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_different_markers_different_module(parser):
+    """Does it make any sense for a function to be a stub in one module,
+    but not in another? I don't know. But it's no problem for us."""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// STUB: SUP 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )
 
-    assert len(blocks) == 3
+    assert len(parser.alerts) == 0
+    assert len(parser.functions) == 2
 
 
-def test_indented():
-    """Offsets for functions inside of a class will probably be indented."""
-    with sample_file("basic_class.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_different_markers_same_module(parser):
+    """Now, if something is a regular function but then a stub,
+    what do we say about that?"""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// STUB: HOWDY 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )
 
-    # TODO: We don't properly detect the end of these functions
-    # because the closing brace is indented. However... knowing where each
-    # function ends is less important (for now) than capturing
-    # all the functions that are there.
+    # Use first marker declaration, don't replace
+    assert len(parser.functions) == 1
+    assert parser.functions[0].is_stub is False
 
-    assert len(blocks) == 2
-    assert blocks[0].offset == int("0x12345678", 16)
-    assert blocks[0].start_line == 15
-    # assert blocks[0].end_line == 18
-
-    assert blocks[1].offset == int("0xdeadbeef", 16)
-    assert blocks[1].start_line == 22
-    # assert blocks[1].end_line == 24
+    # Should alert to this
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
 
 
-def test_inline():
-    with sample_file("inline.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_unexpected_synthetic(parser):
+    """FUNCTION then SYNTHETIC should fail to report either one"""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// SYNTHETIC: HOWDY 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )
 
-    assert len(blocks) == 2
-    for block in blocks:
-        assert block.start_line is not None
-        assert block.start_line == block.end_line
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.functions) == 0
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
 
 
-def test_multiple_offsets():
-    """If multiple offset marks appear before for a code block, take them
-    all but ensure module name (case-insensitive) is distinct.
-    Use first module occurrence in case of duplicates."""
-    with sample_file("multiple_offsets.cpp") as f:
-        blocks = find_code_blocks(f)
+@pytest.mark.skip(reason="not implemented yet")
+def test_duplicate_offset(parser):
+    """Repeating the same module/offset in the same file is probably a typo"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "int x = 1;",
+            "// GLOBAL: HELLO 0x1234",
+            "int y = 2;",
+        ]
+    )
 
-    assert len(blocks) == 4
-    assert blocks[0].module == "TEST"
-    assert blocks[0].start_line == 9
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_OFFSET
 
-    assert blocks[1].module == "HELLO"
-    assert blocks[1].start_line == 9
 
-    # Duplicate modules are ignored
-    assert blocks[2].start_line == 16
-    assert blocks[2].offset == 0x2345
+def test_multiple_variables(parser):
+    """Theoretically the same global variable can appear in multiple modules"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "// GLOBAL: WUZZUP 0x555",
+            "const char *g_greeting;",
+        ]
+    )
+    assert len(parser.alerts) == 0
+    assert len(parser.variables) == 2
 
-    assert blocks[3].module == "TEST"
-    assert blocks[3].offset == 0x2002
+
+def test_multiple_vtables(parser):
+    parser.read_lines(
+        [
+            "// VTABLE: HELLO 0x1234",
+            "// VTABLE: TEST 0x5432",
+            "class MxString : public MxCore {",
+        ]
+    )
+    assert len(parser.alerts) == 0
+    assert len(parser.vtables) == 2
diff --git a/tools/isledecomp/tests/test_parser_samples.py b/tools/isledecomp/tests/test_parser_samples.py
new file mode 100644
index 00000000..a045e3cc
--- /dev/null
+++ b/tools/isledecomp/tests/test_parser_samples.py
@@ -0,0 +1,141 @@
+import os
+import pytest
+from typing import List, TextIO
+from isledecomp.parser import DecompParser
+from isledecomp.parser.node import ParserSymbol
+
+SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
+
+
+def sample_file(filename: str) -> TextIO:
+    """Wrapper for opening the samples from the directory that does not
+    depend on the cwd where we run the test"""
+    full_path = os.path.join(SAMPLE_DIR, filename)
+    return open(full_path, "r", encoding="utf-8")
+
+
+def code_blocks_are_sorted(blocks: List[ParserSymbol]) -> bool:
+    """Helper to make this more idiomatic"""
+    just_offsets = [block.offset for block in blocks]
+    return just_offsets == sorted(just_offsets)
+
+
+@pytest.fixture
+def parser():
+    return DecompParser()
+
+
+# Tests are below #
+
+
+def test_sanity(parser):
+    """Read a very basic file"""
+    with sample_file("basic_file.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 3
+    assert code_blocks_are_sorted(parser.functions) is True
+    # n.b. The parser returns line numbers as 1-based
+    # Function starts when we see the opening curly brace
+    assert parser.functions[0].line_number == 8
+    assert parser.functions[0].end_line == 10
+
+
+def test_oneline(parser):
+    """(Assuming clang-format permits this) This sample has a function
+    on a single line. This will test the end-of-function detection"""
+    with sample_file("oneline_function.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 2
+    assert parser.functions[0].line_number == 5
+    assert parser.functions[0].end_line == 5
+
+
+def test_missing_offset(parser):
+    """What if the function doesn't have an offset comment?"""
+    with sample_file("missing_offset.cpp") as f:
+        parser.read_lines(f)
+
+    # TODO: For now, the function without the offset will just be ignored.
+    # Would be the same outcome if the comment was present but mangled and
+    # we failed to match it. We should detect these cases in the future.
+    assert len(parser.functions) == 1
+
+
+def test_jumbled_case(parser):
+    """The parser just reports what it sees. It is the responsibility of
+    the downstream tools to do something about a jumbled file.
+    Just verify that we are reading it correctly."""
+    with sample_file("out_of_order.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 3
+    assert code_blocks_are_sorted(parser.functions) is False
+
+
+def test_bad_file(parser):
+    with sample_file("poorly_formatted.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 3
+
+
+def test_indented(parser):
+    """Offsets for functions inside of a class will probably be indented."""
+    with sample_file("basic_class.cpp") as f:
+        parser.read_lines(f)
+
+    # TODO: We don't properly detect the end of these functions
+    # because the closing brace is indented. However... knowing where each
+    # function ends is less important (for now) than capturing
+    # all the functions that are there.
+
+    assert len(parser.functions) == 2
+    assert parser.functions[0].offset == int("0x12345678", 16)
+    assert parser.functions[0].line_number == 16
+    # assert parser.functions[0].end_line == 19
+
+    assert parser.functions[1].offset == int("0xdeadbeef", 16)
+    assert parser.functions[1].line_number == 23
+    # assert parser.functions[1].end_line == 25
+
+
+def test_inline(parser):
+    with sample_file("inline.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 2
+    for fun in parser.functions:
+        assert fun.line_number is not None
+        assert fun.line_number == fun.end_line
+
+
+def test_multiple_offsets(parser):
+    """If multiple offset marks appear before for a code block, take them
+    all but ensure module name (case-insensitive) is distinct.
+    Use first module occurrence in case of duplicates."""
+    with sample_file("multiple_offsets.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 4
+    assert parser.functions[0].module == "TEST"
+    assert parser.functions[0].line_number == 9
+
+    assert parser.functions[1].module == "HELLO"
+    assert parser.functions[1].line_number == 9
+
+    # Duplicate modules are ignored
+    assert parser.functions[2].line_number == 16
+    assert parser.functions[2].offset == 0x2345
+
+    assert parser.functions[3].module == "TEST"
+    assert parser.functions[3].offset == 0x2002
+
+
+def test_variables(parser):
+    with sample_file("global_variables.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 1
+    assert len(parser.variables) == 2
diff --git a/tools/isledecomp/tests/test_parser_statechange.py b/tools/isledecomp/tests/test_parser_statechange.py
new file mode 100644
index 00000000..62c19175
--- /dev/null
+++ b/tools/isledecomp/tests/test_parser_statechange.py
@@ -0,0 +1,150 @@
+import pytest
+from isledecomp.parser.parser import (
+    ReaderState as _rs,
+    DecompParser,
+)
+from isledecomp.parser.util import DecompMarker
+from isledecomp.parser.error import ParserError as _pe
+
+# fmt: off
+state_change_marker_cases = [
+    (_rs.SEARCH,          "FUNCTION",   _rs.WANT_SIG,        None),
+    (_rs.SEARCH,          "GLOBAL",     _rs.IN_GLOBAL,       None),
+    (_rs.SEARCH,          "STUB",       _rs.WANT_SIG,        None),
+    (_rs.SEARCH,          "SYNTHETIC",  _rs.IN_TEMPLATE,     None),
+    (_rs.SEARCH,          "TEMPLATE",   _rs.IN_TEMPLATE,     None),
+    (_rs.SEARCH,          "VTABLE",     _rs.IN_VTABLE,       None),
+
+    (_rs.WANT_SIG,        "FUNCTION",   _rs.WANT_SIG,        None),
+    (_rs.WANT_SIG,        "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.WANT_SIG,        "STUB",       _rs.WANT_SIG,        None),
+    (_rs.WANT_SIG,        "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.WANT_SIG,        "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.WANT_SIG,        "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+
+    (_rs.IN_FUNC,         "FUNCTION",   _rs.WANT_SIG,        _pe.MISSED_END_OF_FUNCTION),
+    (_rs.IN_FUNC,         "GLOBAL",     _rs.IN_FUNC_GLOBAL,  None),
+    (_rs.IN_FUNC,         "STUB",       _rs.WANT_SIG,        _pe.MISSED_END_OF_FUNCTION),
+    (_rs.IN_FUNC,         "SYNTHETIC",  _rs.IN_TEMPLATE,     _pe.MISSED_END_OF_FUNCTION),
+    (_rs.IN_FUNC,         "TEMPLATE",   _rs.IN_TEMPLATE,     _pe.MISSED_END_OF_FUNCTION),
+    (_rs.IN_FUNC,         "VTABLE",     _rs.IN_VTABLE,       _pe.MISSED_END_OF_FUNCTION),
+
+    (_rs.IN_TEMPLATE,     "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_TEMPLATE,     "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_TEMPLATE,     "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_TEMPLATE,     "SYNTHETIC",  _rs.IN_TEMPLATE,     None),
+    (_rs.IN_TEMPLATE,     "TEMPLATE",   _rs.IN_TEMPLATE,     None),
+    (_rs.IN_TEMPLATE,     "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    
+    (_rs.WANT_CURLY,      "FUNCTION",   _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "GLOBAL",     _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "STUB",       _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "SYNTHETIC",  _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "TEMPLATE",   _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "VTABLE",     _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    
+    (_rs.IN_GLOBAL,       "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "GLOBAL",     _rs.IN_GLOBAL,       None),
+    (_rs.IN_GLOBAL,       "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    
+    (_rs.IN_FUNC_GLOBAL,  "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "GLOBAL",     _rs.IN_FUNC_GLOBAL,  None),
+    (_rs.IN_FUNC_GLOBAL,  "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    
+    (_rs.IN_VTABLE,       "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "VTABLE",     _rs.IN_VTABLE,       None),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize(
+    "state, marker_type, new_state, expected_error", state_change_marker_cases
+)
+def test_state_change_by_marker(
+    state: _rs, marker_type: str, new_state: _rs, expected_error: None | _pe
+):
+    p = DecompParser()
+    p.state = state
+    p._handle_marker(DecompMarker(marker_type, "TEST", 0x1234))
+    assert p.state == new_state
+
+    if expected_error is not None:
+        assert len(p.alerts) > 0
+        assert p.alerts[0].code == expected_error
+
+
+# Reading any of these lines should have no effect in ReaderState.SEARCH
+search_lines_no_effect = [
+    "",
+    "\t",
+    "    ",
+    "int x = 0;",
+    "// Comment",
+    "/*",
+    "*/",
+    "/* Block comment */",
+    "{",
+    "}",
+]
+
+
+@pytest.mark.parametrize("line", search_lines_no_effect)
+def test_state_search_line(line: str):
+    p = DecompParser()
+    p.read_line(line)
+    assert p.state == _rs.SEARCH
+    assert len(p.alerts) == 0
+
+
+global_lines = [
+    ("// A comment", _rs.IN_GLOBAL),
+    ("", _rs.IN_GLOBAL),
+    ("\t", _rs.IN_GLOBAL),
+    ("    ", _rs.IN_GLOBAL),
+    # TODO: no check for "likely" variable declaration so these all count
+    ("void function()", _rs.SEARCH),
+    ("int x = 123;", _rs.SEARCH),
+    ("just some text", _rs.SEARCH),
+]
+
+
+@pytest.mark.parametrize("line, new_state", global_lines)
+def test_state_global_line(line: str, new_state: _rs):
+    p = DecompParser()
+    p.read_line("// GLOBAL: TEST 0x1234")
+    assert p.state == _rs.IN_GLOBAL
+    p.read_line(line)
+    assert p.state == new_state
+
+
+# mostly same as above
+in_func_global_lines = [
+    ("// A comment", _rs.IN_FUNC_GLOBAL),
+    ("", _rs.IN_FUNC_GLOBAL),
+    ("\t", _rs.IN_FUNC_GLOBAL),
+    ("    ", _rs.IN_FUNC_GLOBAL),
+    # TODO: no check for "likely" variable declaration so these all count
+    ("void function()", _rs.IN_FUNC),
+    ("int x = 123;", _rs.IN_FUNC),
+    ("just some text", _rs.IN_FUNC),
+]
+
+
+@pytest.mark.parametrize("line, new_state", in_func_global_lines)
+def test_state_in_func_global_line(line: str, new_state: _rs):
+    p = DecompParser()
+    p.state = _rs.IN_FUNC
+    p.read_line("// GLOBAL: TEST 0x1234")
+    assert p.state == _rs.IN_FUNC_GLOBAL
+    p.read_line(line)
+    assert p.state == new_state
diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py
index 91fd285b..bbbf2d3a 100644
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@@ -1,11 +1,12 @@
 from collections import namedtuple
 from typing import List
 import pytest
+from isledecomp.parser.parser import MarkerDict
 from isledecomp.parser.util import (
+    DecompMarker,
     is_blank_or_comment,
-    match_offset_comment,
-    is_exact_offset_comment,
-    distinct_by_module,
+    match_marker,
+    is_marker_exact,
 )
 
 
@@ -28,76 +29,72 @@ def test_is_blank_or_comment(line: str, expected: bool):
     assert is_blank_or_comment(line) is expected
 
 
-offset_comment_samples = [
+marker_samples = [
     # (can_parse: bool, exact_match: bool, line: str)
-    # Should match both expected modules with optional STUB marker
-    (True, True, "// OFFSET: LEGO1 0xdeadbeef"),
-    (True, True, "// OFFSET: LEGO1 0xdeadbeef STUB"),
-    (True, True, "// OFFSET: ISLE 0x12345678"),
-    (True, True, "// OFFSET: ISLE 0x12345678 STUB"),
+    (True, True, "// FUNCTION: LEGO1 0xdeadbeef"),
+    (True, True, "// FUNCTION: ISLE 0x12345678"),
     # No trailing spaces allowed
-    (True, False, "// OFFSET: LEGO1 0xdeadbeef  "),
-    (True, False, "// OFFSET: LEGO1 0xdeadbeef STUB "),
+    (True, False, "// FUNCTION: LEGO1 0xdeadbeef  "),
     # Must have exactly one space between elements
-    (True, False, "//OFFSET: ISLE 0xdeadbeef"),
-    (True, False, "// OFFSET:ISLE 0xdeadbeef"),
-    (True, False, "//  OFFSET: ISLE 0xdeadbeef"),
-    (True, False, "// OFFSET:  ISLE 0xdeadbeef"),
-    (True, False, "// OFFSET: ISLE  0xdeadbeef"),
-    (True, False, "// OFFSET: ISLE 0xdeadbeef  STUB"),
+    (True, False, "//FUNCTION: ISLE 0xdeadbeef"),
+    (True, False, "// FUNCTION:ISLE 0xdeadbeef"),
+    (True, False, "//  FUNCTION: ISLE 0xdeadbeef"),
+    (True, False, "// FUNCTION:  ISLE 0xdeadbeef"),
+    (True, False, "// FUNCTION: ISLE  0xdeadbeef"),
     # Must have 0x prefix for hex number
-    (True, False, "// OFFSET: ISLE deadbeef"),
+    (True, False, "// FUNCTION: ISLE deadbeef"),
     # Offset, module name, and STUB must be uppercase
-    (True, False, "// offset: ISLE 0xdeadbeef"),
-    (True, False, "// offset: isle 0xdeadbeef"),
-    (True, False, "// OFFSET: LEGO1 0xdeadbeef stub"),
+    (True, False, "// function: ISLE 0xdeadbeef"),
+    (True, False, "// function: isle 0xdeadbeef"),
     # Hex string must be lowercase
-    (True, False, "// OFFSET: ISLE 0xDEADBEEF"),
+    (True, False, "// FUNCTION: ISLE 0xDEADBEEF"),
     # TODO: How flexible should we be with matching the module name?
-    (True, True, "// OFFSET: OMNI 0x12345678"),
-    (True, True, "// OFFSET: LEG01 0x12345678"),
-    (True, False, "// OFFSET: hello 0x12345678"),
+    (True, True, "// FUNCTION: OMNI 0x12345678"),
+    (True, True, "// FUNCTION: LEG01 0x12345678"),
+    (True, False, "// FUNCTION: hello 0x12345678"),
     # Not close enough to match
-    (False, False, "// OFFSET: ISLE0x12345678"),
-    (False, False, "// OFFSET: 0x12345678"),
+    (False, False, "// FUNCTION: ISLE0x12345678"),
+    (False, False, "// FUNCTION: 0x12345678"),
     (False, False, "// LEGO1: 0x12345678"),
     # Hex string shorter than 8 characters
-    (True, True, "// OFFSET: LEGO1 0x1234"),
+    (True, True, "// FUNCTION: LEGO1 0x1234"),
     # TODO: These match but shouldn't.
-    # (False, False, '// OFFSET: LEGO1 0'),
-    # (False, False, '// OFFSET: LEGO1 0x'),
+    # (False, False, '// FUNCTION: LEGO1 0'),
+    # (False, False, '// FUNCTION: LEGO1 0x'),
 ]
 
 
-@pytest.mark.parametrize("match, _, line", offset_comment_samples)
-def test_offset_match(line: str, match: bool, _):
-    did_match = match_offset_comment(line) is not None
+@pytest.mark.parametrize("match, _, line", marker_samples)
+def test_marker_match(line: str, match: bool, _):
+    did_match = match_marker(line) is not None
     assert did_match is match
 
 
-@pytest.mark.parametrize("_, exact, line", offset_comment_samples)
-def test_exact_offset_comment(line: str, exact: bool, _):
-    assert is_exact_offset_comment(line) is exact
+@pytest.mark.parametrize("_, exact, line", marker_samples)
+def test_marker_exact(line: str, exact: bool, _):
+    assert is_marker_exact(line) is exact
 
 
-# Helper for the next test: cut down version of OffsetMatch
-MiniOfs = namedtuple("MiniOfs", ["module", "value"])
-
-distinct_by_module_samples = [
-    # empty set
-    ([], []),
-    # same module name
-    ([MiniOfs("TEST", 123), MiniOfs("TEST", 555)], [MiniOfs("TEST", 123)]),
-    # same module name, case-insensitive
-    ([MiniOfs("test", 123), MiniOfs("TEST", 555)], [MiniOfs("test", 123)]),
-    # duplicates, non-consecutive
-    (
-        [MiniOfs("test", 123), MiniOfs("abc", 111), MiniOfs("TEST", 555)],
-        [MiniOfs("test", 123), MiniOfs("abc", 111)],
-    ),
-]
+def test_marker_dict_simple():
+    d = MarkerDict()
+    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
+    markers = list(d.iter())
+    assert len(markers) == 1
 
 
-@pytest.mark.parametrize("sample, expected", distinct_by_module_samples)
-def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]):
-    assert distinct_by_module(sample) == expected
+def test_marker_dict_ofs_replace():
+    d = MarkerDict()
+    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
+    d.insert(DecompMarker("FUNCTION", "TEST", 0x555))
+    markers = list(d.iter())
+    assert len(markers) == 1
+    assert markers[0].offset == 0x1234
+
+
+def test_marker_dict_type_replace():
+    d = MarkerDict()
+    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
+    d.insert(DecompMarker("STUB", "TEST", 0x1234))
+    markers = list(d.iter())
+    assert len(markers) == 1
+    assert markers[0].type == "FUNCTION"
diff --git a/tools/reccmp/reccmp.py b/tools/reccmp/reccmp.py
index 02c16029..1d4cd5b1 100755
--- a/tools/reccmp/reccmp.py
+++ b/tools/reccmp/reccmp.py
@@ -10,7 +10,7 @@
 
 from isledecomp import (
     Bin,
-    find_code_blocks,
+    DecompParser,
     get_file_in_script_dir,
     OffsetPlaceholderGenerator,
     print_diff,
@@ -313,18 +313,20 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac
         # Generate basename of original file, used in locating OFFSET lines
         basename = os.path.basename(os.path.splitext(original)[0])
 
+        parser = DecompParser()
         for srcfilename in walk_source_dir(source):
+            parser.reset()
             with open(srcfilename, "r", encoding="utf-8") as srcfile:
-                blocks = find_code_blocks(srcfile)
+                parser.read_lines(srcfile)
 
-            for block in blocks:
-                if block.is_stub:
+            for fun in parser.functions:
+                if fun.is_stub:
                     continue
 
-                if block.module != basename:
+                if fun.module != basename:
                     continue
 
-                addr = block.offset
+                addr = fun.offset
                 # Verbose flag handling
                 if verbose:
                     if addr == verbose:
@@ -332,13 +334,13 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac
                     else:
                         continue
 
-                if block.is_template:
-                    recinfo = syminfo.get_recompiled_address_from_name(block.signature)
+                if fun.is_template:
+                    recinfo = syminfo.get_recompiled_address_from_name(fun.name)
                     if not recinfo:
                         continue
                 else:
                     recinfo = syminfo.get_recompiled_address(
-                        srcfilename, block.start_line
+                        srcfilename, fun.line_number
                     )
                     if not recinfo:
                         continue

From 5a4c9234a9ef1471e68dcdfc1af6a87944d8bbdc Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 15:12:33 -0500
Subject: [PATCH 02/17] Allow prepending space for exact marker match

---
 tools/isledecomp/isledecomp/parser/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py
index 02d3c976..b62f0586 100644
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -12,7 +12,7 @@
     flags=re.I,
 )
 
-markerExactRegex = re.compile(r"// ([A-Z]+): ([A-Z0-9]+) (0x[a-f0-9]+)$")
+markerExactRegex = re.compile(r"\s*// ([A-Z]+): ([A-Z0-9]+) (0x[a-f0-9]+)$")
 
 # The goal here is to just read whatever is on the next line, so some
 # flexibility in the formatting seems OK

From 93ed16e74afb57f01addafdaa49e62114220aa1f Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:17:09 -0500
Subject: [PATCH 03/17] To eliminate noise, require the 0x prefix on offset for
 marker match

---
 tools/isledecomp/isledecomp/parser/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py
index b62f0586..f93b4475 100644
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -8,7 +8,7 @@
 
 
 markerRegex = re.compile(
-    r"\s*//\s*(\w+):\s*(\w+)\s+((?:0x)?[a-f0-9]+)",
+    r"\s*//\s*(\w+):\s*(\w+)\s+(0x[a-f0-9]+)",
     flags=re.I,
 )
 

From 1ba8a93bec5df424c0c505f52eeeebeaf2fa6434 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:39:38 -0500
Subject: [PATCH 04/17] fix test from previous

---
 tools/isledecomp/tests/test_parser.py      | 2 +-
 tools/isledecomp/tests/test_parser_util.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py
index 7731918f..69764f74 100644
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -26,7 +26,7 @@ def test_missing_sig(parser):
 def test_not_exact_syntax(parser):
     """Alert to inexact syntax right here in the parser instead of kicking it downstream.
     Doing this means we don't have to save the actual text."""
-    parser.read_line("// function: test 1234")
+    parser.read_line("// function: test 0x1234")
     assert len(parser.alerts) == 1
     assert parser.alerts[0].code == ParserError.BAD_DECOMP_MARKER
 
diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py
index bbbf2d3a..a8882721 100644
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@@ -41,8 +41,8 @@ def test_is_blank_or_comment(line: str, expected: bool):
     (True, False, "//  FUNCTION: ISLE 0xdeadbeef"),
     (True, False, "// FUNCTION:  ISLE 0xdeadbeef"),
     (True, False, "// FUNCTION: ISLE  0xdeadbeef"),
-    # Must have 0x prefix for hex number
-    (True, False, "// FUNCTION: ISLE deadbeef"),
+    # Must have 0x prefix for hex number to match at all
+    (False, False, "// FUNCTION: ISLE deadbeef"),
     # Offset, module name, and STUB must be uppercase
     (True, False, "// function: ISLE 0xdeadbeef"),
     (True, False, "// function: isle 0xdeadbeef"),

From a8387cd50d84aaa8c8d88e8fab1074618a96c50d Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:40:13 -0500
Subject: [PATCH 05/17] Count tab stops for indented functions to reduce
 MISSED_END_OF_FUNCTION noise

---
 tools/isledecomp/isledecomp/parser/parser.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py
index aebe1c86..7a984b88 100644
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -95,6 +95,18 @@ def reset(self):
         self.fun_markers.empty()
         self.var_markers.empty()
         self.tbl_markers.empty()
+
+        # To handle functions that are entirely indented (i.e. those defined
+        # in class declarations), remember how many whitespace characters
+        # came before the opening curly brace and match that up at the end.
+        # This should give us the same or better accuracy for a well-formed file.
+        # The alternative is counting the curly braces on each line
+        # but that's probably too cumbersome.
+        self.curly_indent_stops = 0
+
+        # For non-synthetic functions, save the line number where the function begins
+        # (i.e. where we see the curly brace) along with the function signature.
+        # We will need both when we reach the end of the function.
         self.function_start: int = 0
         self.function_sig: str = ""
 
@@ -150,6 +162,7 @@ def _function_done(self, unexpected: bool = False):
             )
 
         self.fun_markers.empty()
+        self.curly_indent_stops = 0
         self.state = ReaderState.SEARCH
 
     def _vtable_marker(self, marker: DecompMarker):
@@ -308,13 +321,12 @@ def read_line(self, line: str):
 
         elif self.state == ReaderState.WANT_CURLY:
             if line.strip() == "{":
+                self.curly_indent_stops = line.index("{")
                 self._function_starts_here()
                 self.state = ReaderState.IN_FUNC
 
         elif self.state == ReaderState.IN_FUNC:
-            # Naive but reasonable assumption that functions will end with
-            # a curly brace on its own line with no prepended spaces.
-            if line.startswith("}"):
+            if line.strip().startswith("}") and line[self.curly_indent_stops] == "}":
                 self._function_done()
 
         elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL):

From 8c815418d261ba8c5f67a9a2cae349fe4ac92db8 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:48:56 -0500
Subject: [PATCH 06/17] FUNCTION to SYNTHETIC where needed

---
 ISLE/isleapp.cpp                 | 2 +-
 LEGO1/legoentity.cpp             | 2 +-
 LEGO1/mxcore.h                   | 4 ++--
 LEGO1/mxloopingmidipresenter.cpp | 4 ++--
 LEGO1/mxstillpresenter.cpp       | 6 +++---
 LEGO1/realtime/vector.cpp        | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/ISLE/isleapp.cpp b/ISLE/isleapp.cpp
index 1d4d4641..451f45ad 100644
--- a/ISLE/isleapp.cpp
+++ b/ISLE/isleapp.cpp
@@ -281,7 +281,7 @@ int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine
 	return msg.wParam;
 }
 
-// FUNCTION: ISLE 0x401c40
+// SYNTHETIC: ISLE 0x401c40
 // MxDSObject::SetAtomId
 
 // FUNCTION: ISLE 0x401ca0
diff --git a/LEGO1/legoentity.cpp b/LEGO1/legoentity.cpp
index 7e8a953f..edd1d18d 100644
--- a/LEGO1/legoentity.cpp
+++ b/LEGO1/legoentity.cpp
@@ -7,7 +7,7 @@
 
 DECOMP_SIZE_ASSERT(LegoEntity, 0x68)
 
-// FUNCTION: LEGO1 0x10001090
+// SYNTHETIC: LEGO1 0x10001090
 // LegoEntity::SetWorldSpeed
 
 // FUNCTION: LEGO1 0x1000c290
diff --git a/LEGO1/mxcore.h b/LEGO1/mxcore.h
index 4de691cc..518949c6 100644
--- a/LEGO1/mxcore.h
+++ b/LEGO1/mxcore.h
@@ -9,10 +9,10 @@
 class MxParam;
 
 // TODO: Find proper compilation unit to put these
-// FUNCTION: LEGO1 0x100140d0
+// SYNTHETIC: LEGO1 0x100140d0
 // MxCore::IsA
 
-// FUNCTION: LEGO1 0x100144c0
+// SYNTHETIC: LEGO1 0x100144c0
 // MxCore::ClassName
 
 // VTABLE: LEGO1 0x100dc0f8
diff --git a/LEGO1/mxloopingmidipresenter.cpp b/LEGO1/mxloopingmidipresenter.cpp
index 3d58e56b..55857701 100644
--- a/LEGO1/mxloopingmidipresenter.cpp
+++ b/LEGO1/mxloopingmidipresenter.cpp
@@ -7,10 +7,10 @@
 
 DECOMP_SIZE_ASSERT(MxLoopingMIDIPresenter, 0x58);
 
-// FUNCTION: LEGO1 0x100b1830
+// SYNTHETIC: LEGO1 0x100b1830
 // MxLoopingMIDIPresenter::ClassName
 
-// FUNCTION: LEGO1 0x100b1840
+// SYNTHETIC: LEGO1 0x100b1840
 // MxLoopingMIDIPresenter::IsA
 
 // SYNTHETIC: LEGO1 0x100b19c0
diff --git a/LEGO1/mxstillpresenter.cpp b/LEGO1/mxstillpresenter.cpp
index e3adaa90..1d07e74a 100644
--- a/LEGO1/mxstillpresenter.cpp
+++ b/LEGO1/mxstillpresenter.cpp
@@ -11,7 +11,7 @@ DECOMP_SIZE_ASSERT(MxStillPresenter, 0x6c);
 // GLOBAL: LEGO1 0x10101eb0
 const char* g_strBMP_ISMAP = "BMP_ISMAP";
 
-// FUNCTION: LEGO1 0x10043550
+// SYNTHETIC: LEGO1 0x10043550
 // MxStillPresenter::~MxStillPresenter
 
 // FUNCTION: LEGO1 0x100435b0
@@ -20,10 +20,10 @@ void MxStillPresenter::Destroy()
 	Destroy(FALSE);
 }
 
-// FUNCTION: LEGO1 0x100435c0
+// SYNTHETIC: LEGO1 0x100435c0
 // MxStillPresenter::ClassName
 
-// FUNCTION: LEGO1 0x100435d0
+// SYNTHETIC: LEGO1 0x100435d0
 // MxStillPresenter::IsA
 
 // SYNTHETIC: LEGO1 0x100436e0
diff --git a/LEGO1/realtime/vector.cpp b/LEGO1/realtime/vector.cpp
index 1d41470c..582d6443 100644
--- a/LEGO1/realtime/vector.cpp
+++ b/LEGO1/realtime/vector.cpp
@@ -60,7 +60,7 @@ float Vector2Impl::DotImpl(float* p_a, float* p_b) const
 	return p_b[0] * p_a[0] + p_b[1] * p_a[1];
 }
 
-// FUNCTION: LEGO1 0x10002060
+// SYNTHETIC: LEGO1 0x10002060
 // Vector2Impl::SetData
 
 // FUNCTION: LEGO1 0x10002070

From d87d665127fae7dd6e5bd48d9af14a0a829bf9e2 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:57:17 -0500
Subject: [PATCH 07/17] Missed marker conversion on SetAtomId

---
 LEGO1/mxdsobject.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LEGO1/mxdsobject.h b/LEGO1/mxdsobject.h
index d4563d9a..5410867b 100644
--- a/LEGO1/mxdsobject.h
+++ b/LEGO1/mxdsobject.h
@@ -7,7 +7,7 @@
 #include "mxdstypes.h"
 
 // TODO: Find proper compilation unit to put this
-// FUNCTION: LEGO1 0x10005530
+// SYNTHETIC: LEGO1 0x10005530
 // MxDSObject::SetAtomId
 
 // VTABLE: LEGO1 0x100dc868

From a4d92984da3687da8f3191828f6c3d5a604549a3 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 18:29:20 -0500
Subject: [PATCH 08/17] pylint cleanup, remove unused code

---
 tools/checkorder/checkorder.py                |  3 +-
 tools/isledecomp/isledecomp/parser/node.py    |  1 -
 tools/isledecomp/isledecomp/parser/parser.py  | 74 +++++++++----------
 tools/isledecomp/isledecomp/parser/util.py    |  1 -
 tools/isledecomp/tests/test_parser.py         |  5 +-
 tools/isledecomp/tests/test_parser_samples.py |  6 +-
 .../tests/test_parser_statechange.py          | 12 +--
 tools/isledecomp/tests/test_parser_util.py    |  2 -
 8 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/tools/checkorder/checkorder.py b/tools/checkorder/checkorder.py
index d2f0b23e..02636c09 100644
--- a/tools/checkorder/checkorder.py
+++ b/tools/checkorder/checkorder.py
@@ -19,7 +19,6 @@ def check_file(filename: str, verbose: bool = False) -> bool:
     with open(filename, "r", encoding="utf-8") as f:
         parser.read_lines(f)
 
-
     just_offsets = [block.offset for block in parser.functions]
     sorted_offsets = sorted(just_offsets)
     file_out_of_order = just_offsets != sorted_offsets
@@ -46,7 +45,7 @@ def check_file(filename: str, verbose: bool = False) -> bool:
                 msg = " ".join(
                     [
                         " " if fun.offset > prev_offset else "!",
-                        f"{block.offset:08x}",
+                        f"{fun.offset:08x}",
                         f"{fun.end_line - fun.line_number:4} lines",
                         f"{order_lookup[fun.offset]:3}",
                         "    ",
diff --git a/tools/isledecomp/isledecomp/parser/node.py b/tools/isledecomp/isledecomp/parser/node.py
index f9fbe3b5..96cc3362 100644
--- a/tools/isledecomp/isledecomp/parser/node.py
+++ b/tools/isledecomp/isledecomp/parser/node.py
@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from enum import Enum
 
 
 @dataclass
diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py
index 7a984b88..772ae4ef 100644
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -1,6 +1,6 @@
 # C++ file parser
 
-from typing import List, TextIO, Iterable
+from typing import List, Iterable, Iterator
 from enum import Enum
 from .util import (
     DecompMarker,
@@ -12,7 +12,6 @@
 )
 from .node import (
     ParserAlert,
-    ParserNode,
     ParserFunction,
     ParserVariable,
     ParserVtable,
@@ -64,9 +63,8 @@ def insert(self, marker: DecompMarker) -> bool:
         self.markers[module] = (marker.type, marker.offset)
         return False
 
-    def iter(self):
-        for module in self.markers:
-            (marker_type, offset) = self.markers[module]
+    def iter(self) -> Iterator[DecompMarker]:
+        for module, (marker_type, offset) in self.markers.items():
             yield DecompMarker(marker_type, module, offset)
 
     def empty(self):
@@ -74,27 +72,26 @@ def empty(self):
 
 
 class DecompParser:
+    # pylint: disable=too-many-instance-attributes
+    # Could combine output lists into a single list to get under the limit,
+    # but not right now
     def __init__(self):
-        self.fun_markers = MarkerDict()
-        self.var_markers = MarkerDict()
-        self.tbl_markers = MarkerDict()
-        self.reset()
+        # The lists to be populated as we parse
+        self.functions: List[ParserFunction] = []
+        self.vtables: List[ParserVtable] = []
+        self.variables: List[ParserVariable] = []
+        self.alerts: List[ParserAlert] = []
 
-    def reset(self):
-        # Output values
-        self.functions = []
-        self.vtables = []
-        self.variables = []
-        self.alerts = []
-
-        # Internal state machine stuff
         self.line_number: int = 0
         self.state: ReaderState = ReaderState.SEARCH
 
         self.last_line: str = ""
-        self.fun_markers.empty()
-        self.var_markers.empty()
-        self.tbl_markers.empty()
+
+        # To allow for multiple markers where code is shared across different
+        # modules, save lists of compatible markers that appear in sequence
+        self.fun_markers = MarkerDict()
+        self.var_markers = MarkerDict()
+        self.tbl_markers = MarkerDict()
 
         # To handle functions that are entirely indented (i.e. those defined
         # in class declarations), remember how many whitespace characters
@@ -102,7 +99,7 @@ def reset(self):
         # This should give us the same or better accuracy for a well-formed file.
         # The alternative is counting the curly braces on each line
         # but that's probably too cumbersome.
-        self.curly_indent_stops = 0
+        self.curly_indent_stops: int = 0
 
         # For non-synthetic functions, save the line number where the function begins
         # (i.e. where we see the curly brace) along with the function signature.
@@ -110,6 +107,25 @@ def reset(self):
         self.function_start: int = 0
         self.function_sig: str = ""
 
+    def reset(self):
+        self.functions = []
+        self.vtables = []
+        self.variables = []
+        self.alerts = []
+
+        self.line_number = 0
+        self.state = ReaderState.SEARCH
+
+        self.last_line = ""
+
+        self.fun_markers.empty()
+        self.var_markers.empty()
+        self.tbl_markers.empty()
+
+        self.curly_indent_stops = 0
+        self.function_start = 0
+        self.function_sig = ""
+
     def _recover(self):
         """We hit a syntax error and need to reset temp structures"""
         self.state = ReaderState.SEARCH
@@ -340,19 +356,3 @@ def read_line(self, line: str):
     def read_lines(self, lines: Iterable):
         for line in lines:
             self.read_line(line)
-
-
-def find_code_blocks(stream: TextIO) -> List[ParserNode]:
-    """Read the IO stream (file) line-by-line and give the following report:
-    Foreach code block (function) in the file, what are its starting and
-    ending line numbers, and what is the given offset in the original
-    binary. We expect the result to be ordered by line number because we
-    are reading the file from start to finish."""
-
-    # TODO: this will be replaced shortly. shim for now to avoid
-    # making more changes elsewhere
-    p = DecompParser()
-    for line in stream:
-        p.read_line(line)
-
-    return p.functions
diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py
index f93b4475..38515cfa 100644
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -1,7 +1,6 @@
 # C++ Parser utility functions and data structures
 from __future__ import annotations  # python <3.10 compatibility
 import re
-from typing import List
 from collections import namedtuple
 
 DecompMarker = namedtuple("DecompMarker", ["type", "module", "offset"])
diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py
index 69764f74..fa5343dc 100644
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -3,12 +3,11 @@
     ReaderState,
     DecompParser,
 )
-from isledecomp.parser.util import DecompMarker
 from isledecomp.parser.error import ParserError
 
 
-@pytest.fixture
-def parser():
+@pytest.fixture(name="parser")
+def fixture_parser():
     return DecompParser()
 
 
diff --git a/tools/isledecomp/tests/test_parser_samples.py b/tools/isledecomp/tests/test_parser_samples.py
index a045e3cc..e74fda0e 100644
--- a/tools/isledecomp/tests/test_parser_samples.py
+++ b/tools/isledecomp/tests/test_parser_samples.py
@@ -1,6 +1,6 @@
 import os
-import pytest
 from typing import List, TextIO
+import pytest
 from isledecomp.parser import DecompParser
 from isledecomp.parser.node import ParserSymbol
 
@@ -20,8 +20,8 @@ def code_blocks_are_sorted(blocks: List[ParserSymbol]) -> bool:
     return just_offsets == sorted(just_offsets)
 
 
-@pytest.fixture
-def parser():
+@pytest.fixture(name="parser")
+def fixture_parser():
     return DecompParser()
 
 
diff --git a/tools/isledecomp/tests/test_parser_statechange.py b/tools/isledecomp/tests/test_parser_statechange.py
index 62c19175..714de579 100644
--- a/tools/isledecomp/tests/test_parser_statechange.py
+++ b/tools/isledecomp/tests/test_parser_statechange.py
@@ -3,7 +3,6 @@
     ReaderState as _rs,
     DecompParser,
 )
-from isledecomp.parser.util import DecompMarker
 from isledecomp.parser.error import ParserError as _pe
 
 # fmt: off
@@ -35,28 +34,28 @@
     (_rs.IN_TEMPLATE,     "SYNTHETIC",  _rs.IN_TEMPLATE,     None),
     (_rs.IN_TEMPLATE,     "TEMPLATE",   _rs.IN_TEMPLATE,     None),
     (_rs.IN_TEMPLATE,     "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
-    
+
     (_rs.WANT_CURLY,      "FUNCTION",   _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
     (_rs.WANT_CURLY,      "GLOBAL",     _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
     (_rs.WANT_CURLY,      "STUB",       _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
     (_rs.WANT_CURLY,      "SYNTHETIC",  _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
     (_rs.WANT_CURLY,      "TEMPLATE",   _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
     (_rs.WANT_CURLY,      "VTABLE",     _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
-    
+
     (_rs.IN_GLOBAL,       "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_GLOBAL,       "GLOBAL",     _rs.IN_GLOBAL,       None),
     (_rs.IN_GLOBAL,       "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_GLOBAL,       "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_GLOBAL,       "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_GLOBAL,       "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
-    
+
     (_rs.IN_FUNC_GLOBAL,  "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_FUNC_GLOBAL,  "GLOBAL",     _rs.IN_FUNC_GLOBAL,  None),
     (_rs.IN_FUNC_GLOBAL,  "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_FUNC_GLOBAL,  "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_FUNC_GLOBAL,  "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_FUNC_GLOBAL,  "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
-    
+
     (_rs.IN_VTABLE,       "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_VTABLE,       "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
     (_rs.IN_VTABLE,       "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
@@ -75,7 +74,8 @@ def test_state_change_by_marker(
 ):
     p = DecompParser()
     p.state = state
-    p._handle_marker(DecompMarker(marker_type, "TEST", 0x1234))
+    mock_line = f"// {marker_type}: TEST 0x1234"
+    p.read_line(mock_line)
     assert p.state == new_state
 
     if expected_error is not None:
diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py
index a8882721..131a8ab3 100644
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@@ -1,5 +1,3 @@
-from collections import namedtuple
-from typing import List
 import pytest
 from isledecomp.parser.parser import MarkerDict
 from isledecomp.parser.util import (

From 806720f5c9925b14bf153e969d67141c45d80377 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 19:42:36 -0500
Subject: [PATCH 09/17] Fix unexpected function end, add more unit tests

---
 tools/isledecomp/isledecomp/parser/parser.py |   8 +-
 tools/isledecomp/tests/test_parser.py        | 120 ++++++++++++++++++-
 2 files changed, 122 insertions(+), 6 deletions(-)

diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py
index 772ae4ef..cea4523c 100644
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -162,7 +162,7 @@ def _synthetic_marker(self, marker: DecompMarker):
     def _function_done(self, unexpected: bool = False):
         end_line = self.line_number
         if unexpected:
-            end_line -= -1
+            end_line -= 1
 
         for marker in self.fun_markers.iter():
             self.functions.append(
@@ -249,7 +249,7 @@ def _handle_marker(self, marker: DecompMarker):
                 # We hit another offset unexpectedly.
                 # We can recover easily by just ending the function here.
                 self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
-                self._function_done()
+                self._function_done(unexpected=True)
 
                 # Start the next function right after so we can
                 # read the next line.
@@ -262,7 +262,7 @@ def _handle_marker(self, marker: DecompMarker):
                 self._synthetic_marker(marker)
             elif self.state == ReaderState.IN_FUNC:
                 self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
-                self._function_done()
+                self._function_done(unexpected=True)
                 self._synthetic_marker(marker)
             else:
                 self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
@@ -283,7 +283,7 @@ def _handle_marker(self, marker: DecompMarker):
                 self._vtable_marker(marker)
             elif self.state == ReaderState.IN_FUNC:
                 self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
-                self._function_done()
+                self._function_done(unexpected=True)
                 self._vtable_marker(marker)
             else:
                 self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py
index fa5343dc..ce8ad307 100644
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -39,7 +39,8 @@ def test_invalid_marker(parser):
     assert parser.alerts[0].code == ParserError.BOGUS_MARKER
 
 
-def test_unexpected_marker(parser):
+def test_incompatible_marker(parser):
+    """The marker we just read cannot be handled in the current parser state"""
     parser.read_lines(
         [
             "// FUNCTION: TEST 0x1234",
@@ -52,6 +53,7 @@ def test_unexpected_marker(parser):
 
 
 def test_variable(parser):
+    """Should identify a global variable"""
     parser.read_lines(
         [
             "// GLOBAL: HELLO 0x1234",
@@ -62,7 +64,8 @@ def test_variable(parser):
 
 
 def test_synthetic_plus_marker(parser):
-    """Should fail with error and not log the synthetic"""
+    """Marker tracking preempts synthetic name detection.
+    Should fail with error and not log the synthetic"""
     parser.read_lines(
         [
             "// SYNTHETIC: HEY 0x555",
@@ -157,6 +160,21 @@ def test_multiple_variables(parser):
     assert len(parser.variables) == 2
 
 
+def test_multiple_variables_same_module(parser):
+    """Should not overwrite offset"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "// GLOBAL: HELLO 0x555",
+            "const char *g_greeting;",
+        ]
+    )
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
+    assert len(parser.variables) == 1
+    assert parser.variables[0].offset == 0x1234
+
+
 def test_multiple_vtables(parser):
     parser.read_lines(
         [
@@ -167,3 +185,101 @@ def test_multiple_vtables(parser):
     )
     assert len(parser.alerts) == 0
     assert len(parser.vtables) == 2
+
+
+def test_multiple_vtables_same_module(parser):
+    """Should not overwrite offset"""
+    parser.read_lines(
+        [
+            "// VTABLE: HELLO 0x1234",
+            "// VTABLE: HELLO 0x5432",
+            "class MxString : public MxCore {",
+        ]
+    )
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
+    assert len(parser.vtables) == 1
+    assert parser.vtables[0].offset == 0x1234
+
+
+def test_synthetic(parser):
+    parser.read_lines(
+        [
+            "// SYNTHETIC: TEST 0x1234",
+            "// TestClass::TestMethod",
+        ]
+    )
+    assert len(parser.functions) == 1
+    assert parser.functions[0].is_template is True
+    assert parser.functions[0].name == "TestClass::TestMethod"
+
+
+def test_synthetic_same_module(parser):
+    parser.read_lines(
+        [
+            "// SYNTHETIC: TEST 0x1234",
+            "// SYNTHETIC: TEST 0x555",
+            "// TestClass::TestMethod",
+        ]
+    )
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
+    assert len(parser.functions) == 1
+    assert parser.functions[0].offset == 0x1234
+
+
+@pytest.mark.skip(reason="todo")
+def test_synthetic_no_comment(parser):
+    """Synthetic marker followed by a code line (i.e. non-comment)"""
+    parser.read_lines(
+        [
+            "// SYNTHETIC: TEST 0x1234",
+            "int x = 123;",
+        ]
+    )
+    assert len(parser.functions) == 0
+
+
+def test_single_line_function(parser):
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "int hello() { return 1234; }",
+        ]
+    )
+    assert len(parser.functions) == 1
+    assert parser.functions[0].line_number == 2
+    assert parser.functions[0].end_line == 2
+
+
+def test_indented_function(parser):
+    """Track the number of whitespace characters when we begin the function
+    and check that against each closing curly brace we read.
+    Should not report a syntax warning if the function is indented"""
+    parser.read_lines(
+        [
+            "    // FUNCTION: TEST 0x1234",
+            "    void indented()",
+            "    {",
+            "        // TODO",
+            "    }",
+            "    // FUNCTION: NEXT 0x555",
+        ]
+    )
+    assert len(parser.alerts) == 0
+
+
+@pytest.mark.xfail(reason="todo")
+def test_indented_no_curly_hint(parser):
+    """Same as above, but opening curly brace is on the same line.
+    Without the hint of how many whitespace characters to check, can we
+    still identify the end of the function?"""
+    parser.read_lines(
+        [
+            "    // FUNCTION: TEST 0x1234",
+            "    void indented() {",
+            "    }",
+            "    // FUNCTION: NEXT 0x555",
+        ]
+    )
+    assert len(parser.alerts) == 0

From 9947231f6d48612e9ab0bae2b7da07643cac1890 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Fri, 1 Dec 2023 19:58:40 -0500
Subject: [PATCH 10/17] Be more strict about synthetic name syntax

---
 tools/isledecomp/isledecomp/parser/error.py  |  6 +++---
 tools/isledecomp/isledecomp/parser/parser.py | 12 ++++++++----
 tools/isledecomp/isledecomp/parser/util.py   |  8 ++++----
 tools/isledecomp/tests/test_parser.py        |  4 +++-
 4 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/tools/isledecomp/isledecomp/parser/error.py b/tools/isledecomp/isledecomp/parser/error.py
index 8bda90da..dce99448 100644
--- a/tools/isledecomp/isledecomp/parser/error.py
+++ b/tools/isledecomp/isledecomp/parser/error.py
@@ -18,9 +18,6 @@ class ParserError(Enum):
     # to handle it
     BOGUS_MARKER = 104
 
-    # WARN: Under a synthetic marker we expected a comment but found a code line instead
-    SYNTHETIC_NOT_COMMENT = 110
-
     # WARN: New function marker appeared while we were inside a function
     MISSED_END_OF_FUNCTION = 117
 
@@ -31,3 +28,6 @@ class ParserError(Enum):
     # with the preceding markers.
     # For example, a GLOBAL cannot follow FUNCTION/STUB
     INCOMPATIBLE_MARKER = 201
+
+    # ERROR: The line following a synthetic marker was not a comment
+    BAD_SYNTHETIC = 202
diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py
index cea4523c..24dc0b23 100644
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -7,7 +7,7 @@
     is_blank_or_comment,
     match_marker,
     is_marker_exact,
-    get_template_function_name,
+    get_synthetic_name,
     remove_trailing_comment,
 )
 from .node import (
@@ -307,9 +307,13 @@ def read_line(self, line: str):
         if self.state == ReaderState.IN_TEMPLATE:
             # TEMPLATE functions are a special case. The signature is
             # given on the next line (in a // comment)
-            self.function_sig = get_template_function_name(line)
-            self._function_starts_here()
-            self._function_done()
+            name = get_synthetic_name(line)
+            if name is None:
+                self._syntax_error(ParserError.BAD_SYNTHETIC)
+            else:
+                self.function_sig = name
+                self._function_starts_here()
+                self._function_done()
 
         elif self.state == ReaderState.WANT_SIG:
             # Skip blank lines or comments that come after the offset
diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py
index 38515cfa..e0b25d85 100644
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -23,15 +23,15 @@
 trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")
 
 
-def get_template_function_name(line: str) -> str:
-    """Parse function signature for special TEMPLATE functions"""
+def get_synthetic_name(line: str) -> str | None:
+    """Synthetic names appear on a single line comment on the line after the marker.
+    If that's not what we have, return None"""
     template_match = templateCommentRegex.match(line)
 
-    # If we don't match, you get whatever is on the line as the signature
     if template_match is not None:
         return template_match.group(1)
 
-    return line
+    return None
 
 
 def remove_trailing_comment(line: str) -> str:
diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py
index ce8ad307..222a9ac1 100644
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -228,7 +228,6 @@ def test_synthetic_same_module(parser):
     assert parser.functions[0].offset == 0x1234
 
 
-@pytest.mark.skip(reason="todo")
 def test_synthetic_no_comment(parser):
     """Synthetic marker followed by a code line (i.e. non-comment)"""
     parser.read_lines(
@@ -238,6 +237,9 @@ def test_synthetic_no_comment(parser):
         ]
     )
     assert len(parser.functions) == 0
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.BAD_SYNTHETIC
+    assert parser.state == ReaderState.SEARCH
 
 
 def test_single_line_function(parser):

From 97468dcd8231683aec72bce8d0b5927d73b6014a Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Sat, 2 Dec 2023 12:30:48 -0500
Subject: [PATCH 11/17] Revert "Missed marker conversion on SetAtomId"

This reverts commit d87d665127fae7dd6e5bd48d9af14a0a829bf9e2.
---
 LEGO1/mxdsobject.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LEGO1/mxdsobject.h b/LEGO1/mxdsobject.h
index 5410867b..d4563d9a 100644
--- a/LEGO1/mxdsobject.h
+++ b/LEGO1/mxdsobject.h
@@ -7,7 +7,7 @@
 #include "mxdstypes.h"
 
 // TODO: Find proper compilation unit to put this
-// SYNTHETIC: LEGO1 0x10005530
+// FUNCTION: LEGO1 0x10005530
 // MxDSObject::SetAtomId
 
 // VTABLE: LEGO1 0x100dc868

From ba4e28b8c48b157a6f4ec64427f6814f89857f97 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Sat, 2 Dec 2023 12:31:09 -0500
Subject: [PATCH 12/17] Revert "FUNCTION to SYNTHETIC where needed"

This reverts commit 8c815418d261ba8c5f67a9a2cae349fe4ac92db8.
---
 ISLE/isleapp.cpp                 | 2 +-
 LEGO1/legoentity.cpp             | 2 +-
 LEGO1/mxcore.h                   | 4 ++--
 LEGO1/mxloopingmidipresenter.cpp | 4 ++--
 LEGO1/mxstillpresenter.cpp       | 6 +++---
 LEGO1/realtime/vector.cpp        | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/ISLE/isleapp.cpp b/ISLE/isleapp.cpp
index 451f45ad..1d4d4641 100644
--- a/ISLE/isleapp.cpp
+++ b/ISLE/isleapp.cpp
@@ -281,7 +281,7 @@ int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine
 	return msg.wParam;
 }
 
-// SYNTHETIC: ISLE 0x401c40
+// FUNCTION: ISLE 0x401c40
 // MxDSObject::SetAtomId
 
 // FUNCTION: ISLE 0x401ca0
diff --git a/LEGO1/legoentity.cpp b/LEGO1/legoentity.cpp
index edd1d18d..7e8a953f 100644
--- a/LEGO1/legoentity.cpp
+++ b/LEGO1/legoentity.cpp
@@ -7,7 +7,7 @@
 
 DECOMP_SIZE_ASSERT(LegoEntity, 0x68)
 
-// SYNTHETIC: LEGO1 0x10001090
+// FUNCTION: LEGO1 0x10001090
 // LegoEntity::SetWorldSpeed
 
 // FUNCTION: LEGO1 0x1000c290
diff --git a/LEGO1/mxcore.h b/LEGO1/mxcore.h
index 518949c6..4de691cc 100644
--- a/LEGO1/mxcore.h
+++ b/LEGO1/mxcore.h
@@ -9,10 +9,10 @@
 class MxParam;
 
 // TODO: Find proper compilation unit to put these
-// SYNTHETIC: LEGO1 0x100140d0
+// FUNCTION: LEGO1 0x100140d0
 // MxCore::IsA
 
-// SYNTHETIC: LEGO1 0x100144c0
+// FUNCTION: LEGO1 0x100144c0
 // MxCore::ClassName
 
 // VTABLE: LEGO1 0x100dc0f8
diff --git a/LEGO1/mxloopingmidipresenter.cpp b/LEGO1/mxloopingmidipresenter.cpp
index 55857701..3d58e56b 100644
--- a/LEGO1/mxloopingmidipresenter.cpp
+++ b/LEGO1/mxloopingmidipresenter.cpp
@@ -7,10 +7,10 @@
 
 DECOMP_SIZE_ASSERT(MxLoopingMIDIPresenter, 0x58);
 
-// SYNTHETIC: LEGO1 0x100b1830
+// FUNCTION: LEGO1 0x100b1830
 // MxLoopingMIDIPresenter::ClassName
 
-// SYNTHETIC: LEGO1 0x100b1840
+// FUNCTION: LEGO1 0x100b1840
 // MxLoopingMIDIPresenter::IsA
 
 // SYNTHETIC: LEGO1 0x100b19c0
diff --git a/LEGO1/mxstillpresenter.cpp b/LEGO1/mxstillpresenter.cpp
index 1d07e74a..e3adaa90 100644
--- a/LEGO1/mxstillpresenter.cpp
+++ b/LEGO1/mxstillpresenter.cpp
@@ -11,7 +11,7 @@ DECOMP_SIZE_ASSERT(MxStillPresenter, 0x6c);
 // GLOBAL: LEGO1 0x10101eb0
 const char* g_strBMP_ISMAP = "BMP_ISMAP";
 
-// SYNTHETIC: LEGO1 0x10043550
+// FUNCTION: LEGO1 0x10043550
 // MxStillPresenter::~MxStillPresenter
 
 // FUNCTION: LEGO1 0x100435b0
@@ -20,10 +20,10 @@ void MxStillPresenter::Destroy()
 	Destroy(FALSE);
 }
 
-// SYNTHETIC: LEGO1 0x100435c0
+// FUNCTION: LEGO1 0x100435c0
 // MxStillPresenter::ClassName
 
-// SYNTHETIC: LEGO1 0x100435d0
+// FUNCTION: LEGO1 0x100435d0
 // MxStillPresenter::IsA
 
 // SYNTHETIC: LEGO1 0x100436e0
diff --git a/LEGO1/realtime/vector.cpp b/LEGO1/realtime/vector.cpp
index 582d6443..1d41470c 100644
--- a/LEGO1/realtime/vector.cpp
+++ b/LEGO1/realtime/vector.cpp
@@ -60,7 +60,7 @@ float Vector2Impl::DotImpl(float* p_a, float* p_b) const
 	return p_b[0] * p_a[0] + p_b[1] * p_a[1];
 }
 
-// SYNTHETIC: LEGO1 0x10002060
+// FUNCTION: LEGO1 0x10002060
 // Vector2Impl::SetData
 
 // FUNCTION: LEGO1 0x10002070

From db971ada604038d6f27b5930cce919048e1ba8fd Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Sat, 2 Dec 2023 13:07:11 -0500
Subject: [PATCH 13/17] Implicit lookup by name for functions

---
 tools/isledecomp/isledecomp/parser/error.py  | 10 ++-
 tools/isledecomp/isledecomp/parser/node.py   |  1 +
 tools/isledecomp/isledecomp/parser/parser.py | 50 ++++++++---
 tools/isledecomp/tests/test_parser.py        | 88 ++++++++++++++++++--
 tools/reccmp/reccmp.py                       |  2 +-
 5 files changed, 129 insertions(+), 22 deletions(-)

diff --git a/tools/isledecomp/isledecomp/parser/error.py b/tools/isledecomp/isledecomp/parser/error.py
index dce99448..c18e3e29 100644
--- a/tools/isledecomp/isledecomp/parser/error.py
+++ b/tools/isledecomp/isledecomp/parser/error.py
@@ -19,7 +19,15 @@ class ParserError(Enum):
     BOGUS_MARKER = 104
 
     # WARN: New function marker appeared while we were inside a function
-    MISSED_END_OF_FUNCTION = 117
+    MISSED_END_OF_FUNCTION = 105
+
+    # WARN: If we find a curly brace right after the function declaration
+    # this is wrong but we still have enough to make a match with reccmp
+    MISSED_START_OF_FUNCTION = 106
+
+    # WARN: A blank line appeared between the end of FUNCTION markers
+    # and the start of the function. We can ignore it, but the line shouldn't be there
+    UNEXPECTED_BLANK_LINE = 107
 
     # ERROR: We found a marker unexpectedly
     UNEXPECTED_MARKER = 200
diff --git a/tools/isledecomp/isledecomp/parser/node.py b/tools/isledecomp/isledecomp/parser/node.py
index 96cc3362..0ee87000 100644
--- a/tools/isledecomp/isledecomp/parser/node.py
+++ b/tools/isledecomp/isledecomp/parser/node.py
@@ -21,6 +21,7 @@ class ParserSymbol(ParserNode):
 @dataclass
 class ParserFunction(ParserSymbol):
     name: str
+    lookup_by_name: bool = False
     is_stub: bool = False
     is_synthetic: bool = False
     is_template: bool = False
diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py
index 24dc0b23..37e8d386 100644
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -42,6 +42,10 @@ def marker_is_synthetic(marker: DecompMarker) -> bool:
     return marker.type.upper() in ("SYNTHETIC", "TEMPLATE")
 
 
+def marker_is_template(marker: DecompMarker) -> bool:
+    return marker.type.upper() == "TEMPLATE"
+
+
 def marker_is_function(marker: DecompMarker) -> bool:
     return marker.type.upper() in ("FUNCTION", "STUB")
 
@@ -55,8 +59,8 @@ def __init__(self):
         self.markers: dict = {}
 
     def insert(self, marker: DecompMarker) -> bool:
+        """Return True if this insert would overwrite"""
         module = marker.module.upper()
-        # Return True if this insert would overwrite
         if module in self.markers:
             return True
 
@@ -159,9 +163,12 @@ def _synthetic_marker(self, marker: DecompMarker):
             self._syntax_warning(ParserError.DUPLICATE_MODULE)
         self.state = ReaderState.IN_TEMPLATE
 
-    def _function_done(self, unexpected: bool = False):
+    def _function_done(self, lookup_by_name: bool = False, unexpected: bool = False):
         end_line = self.line_number
         if unexpected:
+            # If we missed the end of the previous function, assume it ended
+            # on the previous line and that whatever we are tracking next
+            # begins on the current line.
             end_line -= 1
 
         for marker in self.fun_markers.iter():
@@ -170,8 +177,10 @@ def _function_done(self, unexpected: bool = False):
                     line_number=self.function_start,
                     module=marker.module,
                     offset=marker.offset,
+                    lookup_by_name=lookup_by_name,
                     is_stub=marker_is_stub(marker),
-                    is_template=marker_is_synthetic(marker),
+                    is_synthetic=marker_is_synthetic(marker),
+                    is_template=marker_is_template(marker),
                     name=self.function_sig,
                     end_line=end_line,
                 )
@@ -262,7 +271,7 @@ def _handle_marker(self, marker: DecompMarker):
                 self._synthetic_marker(marker)
             elif self.state == ReaderState.IN_FUNC:
                 self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
-                self._function_done(unexpected=True)
+                self._function_done(lookup_by_name=True, unexpected=True)
                 self._synthetic_marker(marker)
             else:
                 self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
@@ -304,6 +313,7 @@ def read_line(self, line: str):
             self._handle_marker(marker)
             return
 
+        line_strip = line.strip()
         if self.state == ReaderState.IN_TEMPLATE:
             # TEMPLATE functions are a special case. The signature is
             # given on the next line (in a // comment)
@@ -313,16 +323,32 @@ def read_line(self, line: str):
             else:
                 self.function_sig = name
                 self._function_starts_here()
-                self._function_done()
+                self._function_done(lookup_by_name=True)
 
         elif self.state == ReaderState.WANT_SIG:
-            # Skip blank lines or comments that come after the offset
-            # marker. There is not a formal procedure for this, so just
-            # assume the next "code line" is the function signature
-            if not is_blank_or_comment(line):
+            # Ignore blanks on the way to function start or function name
+            if len(line_strip) == 0:
+                self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE)
+
+            elif line_strip.startswith("//"):
+                # If we found a comment, assume implicit lookup-by-name
+                # function and end here. We know this is not a decomp marker
+                # because it would have been handled already.
+                self.function_sig = get_synthetic_name(line)
+                self._function_starts_here()
+                self._function_done(lookup_by_name=True)
+
+            elif line_strip == "{":
+                # We missed the function signature but we can recover from this
+                self.function_sig = "(unknown)"
+                self._function_starts_here()
+                self._syntax_warning(ParserError.MISSED_START_OF_FUNCTION)
+                self.state = ReaderState.IN_FUNC
+
+            else:
                 # Inline functions may end with a comment. Strip that out
                 # to help parsing.
-                self.function_sig = remove_trailing_comment(line.strip())
+                self.function_sig = remove_trailing_comment(line_strip)
 
                 # Now check to see if the opening curly bracket is on the
                 # same line. clang-format should prevent this (BraceWrapping)
@@ -340,13 +366,13 @@ def read_line(self, line: str):
                     self.state = ReaderState.WANT_CURLY
 
         elif self.state == ReaderState.WANT_CURLY:
-            if line.strip() == "{":
+            if line_strip == "{":
                 self.curly_indent_stops = line.index("{")
                 self._function_starts_here()
                 self.state = ReaderState.IN_FUNC
 
         elif self.state == ReaderState.IN_FUNC:
-            if line.strip().startswith("}") and line[self.curly_indent_stops] == "}":
+            if line_strip.startswith("}") and line[self.curly_indent_stops] == "}":
                 self._function_done()
 
         elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL):
diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py
index 222a9ac1..7954aee1 100644
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -11,15 +11,23 @@ def fixture_parser():
     return DecompParser()
 
 
-@pytest.mark.skip(reason="todo")
 def test_missing_sig(parser):
-    """Bad syntax: function signature is missing"""
-    parser.read_lines(["// FUNCTION: TEST 0x1234", "{"])
-    assert parser.state == ReaderState.IN_FUNC
-    assert len(parser.alerts) == 1
-    parser.read_line("}")
+    """In the hopefully rare scenario that the function signature and marker
+    are swapped, we still have enough to match witch reccmp"""
+    parser.read_lines(
+        [
+            "void my_function()",
+            "// FUNCTION: TEST 0x1234",
+            "{",
+            "}",
+        ]
+    )
+    assert parser.state == ReaderState.SEARCH
     assert len(parser.functions) == 1
-    assert parser.functions[0] != "{"
+    assert parser.functions[0].line_number == 3
+
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.MISSED_START_OF_FUNCTION
 
 
 def test_not_exact_syntax(parser):
@@ -210,7 +218,7 @@ def test_synthetic(parser):
         ]
     )
     assert len(parser.functions) == 1
-    assert parser.functions[0].is_template is True
+    assert parser.functions[0].lookup_by_name is True
     assert parser.functions[0].name == "TestClass::TestMethod"
 
 
@@ -285,3 +293,67 @@ def test_indented_no_curly_hint(parser):
         ]
     )
     assert len(parser.alerts) == 0
+
+
+def test_implicit_lookup_by_name(parser):
+    """FUNCTION (or STUB) offsets must directly precede the function signature.
+    If we detect a comment instead, we assume that this is a lookup-by-name
+    function and end here."""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "// TestClass::TestMethod()",
+        ]
+    )
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.functions) == 1
+    assert parser.functions[0].lookup_by_name is True
+    assert parser.functions[0].name == "TestClass::TestMethod()"
+
+
+def test_function_with_spaces(parser):
+    """There should not be any spaces between the end of FUNCTION markers
+    and the start or name of the function. If it's a blank line, we can safely
+    ignore but should alert to this."""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "   ",
+            "inline void test_function() { };",
+        ]
+    )
+    assert len(parser.functions) == 1
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.UNEXPECTED_BLANK_LINE
+
+
+def test_function_with_spaces_implicit(parser):
+    """Same as above, but for implicit lookup-by-name"""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "   ",
+            "// Implicit::Method",
+        ]
+    )
+    assert len(parser.functions) == 1
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.UNEXPECTED_BLANK_LINE
+
+
+@pytest.mark.xfail(reason="will assume implicit lookup-by-name function")
+def test_function_is_commented(parser):
+    """In an ideal world, we would recognize that there is no code here.
+    Some editors (or users) might comment the function on each line like this
+    but hopefully it is rare."""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "// int my_function()",
+            "// {",
+            "//     return 5;",
+            "// }",
+        ]
+    )
+
+    assert len(parser.functions) == 0
diff --git a/tools/reccmp/reccmp.py b/tools/reccmp/reccmp.py
index 1d4cd5b1..f7659703 100755
--- a/tools/reccmp/reccmp.py
+++ b/tools/reccmp/reccmp.py
@@ -334,7 +334,7 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac
                     else:
                         continue
 
-                if fun.is_template:
+                if fun.lookup_by_name:
                     recinfo = syminfo.get_recompiled_address_from_name(fun.name)
                     if not recinfo:
                         continue

From 9f1302e8d807b6efd871dfee9dcb753488314889 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Sat, 2 Dec 2023 18:31:46 -0500
Subject: [PATCH 14/17] Fix VTABLE SYNTHETIC and other decomp markers

---
 LEGO1/legoinputmanager.h       |  6 +++---
 LEGO1/legopathcontrollerlist.h |  6 +++---
 LEGO1/mxdirect3d.h             |  3 ++-
 LEGO1/mxdsactionlist.h         |  4 ++--
 LEGO1/mxpalette.cpp            |  3 ++-
 LEGO1/mxpresenterlist.h        |  6 +++---
 LEGO1/mxregionlist.h           | 14 ++++++++------
 LEGO1/mxstreamchunklist.h      |  4 ++--
 LEGO1/mxstreamlist.cpp         |  2 +-
 LEGO1/mxvariabletable.h        |  4 ++--
 10 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/LEGO1/legoinputmanager.h b/LEGO1/legoinputmanager.h
index a6e34a44..9a4fe6e7 100644
--- a/LEGO1/legoinputmanager.h
+++ b/LEGO1/legoinputmanager.h
@@ -12,13 +12,13 @@
 
 class LegoControlManager;
 
-// VTABLE: LEGO1 0x100d87b8 SYNTHETIC
+// VTABLE: LEGO1 0x100d87b8
 // class MxCollection<LegoEventNotificationParam>
 
-// VTABLE: LEGO1 0x100d87d0 SYNTHETIC
+// VTABLE: LEGO1 0x100d87d0
 // class MxList<LegoEventNotificationParam>
 
-// VTABLE: LEGO1 0x100d87e8 SYNTHETIC
+// VTABLE: LEGO1 0x100d87e8
 // class MxQueue<LegoEventNotificationParam>
 
 // VTABLE: LEGO1 0x100d8800
diff --git a/LEGO1/legopathcontrollerlist.h b/LEGO1/legopathcontrollerlist.h
index dbda40e9..5a7d4c60 100644
--- a/LEGO1/legopathcontrollerlist.h
+++ b/LEGO1/legopathcontrollerlist.h
@@ -5,7 +5,7 @@
 #include "mxlist.h"
 #include "mxtypes.h"
 
-// VTABLE: LEGO1 0x100d6320 SYNTHETIC
+// VTABLE: LEGO1 0x100d6320
 // class MxPtrList<LegoPathController>
 
 // VTABLE: LEGO1 0x100d6338
@@ -17,10 +17,10 @@ class LegoPathControllerList : public MxPtrList<LegoPathController> {
 	static void Destroy(LegoPathController*);
 };
 
-// VTABLE: LEGO1 0x100d6380 SYNTHETIC
+// VTABLE: LEGO1 0x100d6380
 // class MxCollection<LegoPathController *>
 
-// VTABLE: LEGO1 0x100d6398 SYNTHETIC
+// VTABLE: LEGO1 0x100d6398
 // class MxList<LegoPathController *>
 
 #endif // LEGOPATHCONTROLLERLIST_H
diff --git a/LEGO1/mxdirect3d.h b/LEGO1/mxdirect3d.h
index 0d0b48fd..f038ee7f 100644
--- a/LEGO1/mxdirect3d.h
+++ b/LEGO1/mxdirect3d.h
@@ -17,7 +17,8 @@ class MxDeviceModeFinder {
 	MxDirectDraw::DeviceModesInfo* m_deviceInfo; // +0xe0
 };
 
-// VTABLE: LEGO1 0x100db814 (or 0x100d9cc8?)
+// VTABLE: LEGO1 0x100db814
+// or is it 0x100d9cc8?
 // SIZE 0x198
 class MxDeviceEnumerate {
 public:
diff --git a/LEGO1/mxdsactionlist.h b/LEGO1/mxdsactionlist.h
index 7050dc53..19929d20 100644
--- a/LEGO1/mxdsactionlist.h
+++ b/LEGO1/mxdsactionlist.h
@@ -6,10 +6,10 @@
 
 class MxDSAction;
 
-// VTABLE: LEGO1 0x100dcea8 SYNTHETIC
+// VTABLE: LEGO1 0x100dcea8
 // class MxCollection<MxDSAction *>
 
-// VTABLE: LEGO1 0x100dcec0 SYNTHETIC
+// VTABLE: LEGO1 0x100dcec0
 // class MxList<MxDSAction *>
 
 // VTABLE: LEGO1 0x100dced8
diff --git a/LEGO1/mxpalette.cpp b/LEGO1/mxpalette.cpp
index b5b30257..aefe57c2 100644
--- a/LEGO1/mxpalette.cpp
+++ b/LEGO1/mxpalette.cpp
@@ -3,7 +3,8 @@
 #include "mxomni.h"
 #include "mxvideomanager.h"
 
-// GLOBAL: LEGO1 0x10102188 0x400
+// GLOBAL: LEGO1 0x10102188
+// SIZE: 0x400
 PALETTEENTRY g_defaultPaletteEntries[256] = {
 	{0u, 0u, 0u, 0u},       {128u, 0u, 0u, 0u},     {0u, 128u, 0u, 0u},     {128u, 128u, 0u, 0u},
 	{0u, 0u, 128u, 0u},     {128u, 0u, 128u, 0u},   {0u, 128u, 128u, 0u},   {128u, 128u, 128u, 0u},
diff --git a/LEGO1/mxpresenterlist.h b/LEGO1/mxpresenterlist.h
index 6b84ffab..d96374b9 100644
--- a/LEGO1/mxpresenterlist.h
+++ b/LEGO1/mxpresenterlist.h
@@ -5,7 +5,7 @@
 
 class MxPresenter;
 
-// VTABLE: LEGO1 0x100d62f0 SYNTHETIC
+// VTABLE: LEGO1 0x100d62f0
 // class MxPtrList<MxPresenter>
 
 // VTABLE: LEGO1 0x100d6308
@@ -17,10 +17,10 @@ class MxPresenterList : public MxPtrList<MxPresenter> {
 
 typedef MxListCursorChildChild<MxPresenter*> MxPresenterListCursor;
 
-// VTABLE: LEGO1 0x100d6350 SYNTHETIC
+// VTABLE: LEGO1 0x100d6350
 // class MxCollection<MxPresenter *>
 
-// VTABLE: LEGO1 0x100d6368 SYNTHETIC
+// VTABLE: LEGO1 0x100d6368
 // class MxList<MxPresenter *>
 
 #endif // MXPRESENTERLIST_H
diff --git a/LEGO1/mxregionlist.h b/LEGO1/mxregionlist.h
index 48bf9050..3aa0487b 100644
--- a/LEGO1/mxregionlist.h
+++ b/LEGO1/mxregionlist.h
@@ -6,13 +6,13 @@
 struct MxRegionTopBottom;
 struct MxRegionLeftRight;
 
-// VTABLE: LEGO1 0x100dcb10 SYNTHETIC
+// VTABLE: LEGO1 0x100dcb10
 // class MxCollection<MxRegionTopBottom *>
 
-// VTABLE: LEGO1 0x100dcb28 SYNTHETIC
+// VTABLE: LEGO1 0x100dcb28
 // class MxList<MxRegionTopBottom *>
 
-// VTABLE: LEGO1 0x100dcb40 SYNTHETIC
+// VTABLE: LEGO1 0x100dcb40
 // class MxPtrList<MxRegionTopBottom>
 
 // VTABLE: LEGO1 0x100dcb58
@@ -24,18 +24,20 @@ class MxRegionList : public MxPtrList<MxRegionTopBottom> {
 };
 
 // VTABLE: LEGO1 0x100dcb88
+// class MxListCursorChildChild<MxRegionTopBottom *>
 typedef MxListCursorChildChild<MxRegionTopBottom*> MxRegionListCursor;
 
 // VTABLE: LEGO1 0x100dcc10
+// class MxListCursorChildChild<MxRegionLeftRight *>
 typedef MxListCursorChildChild<MxRegionLeftRight*> MxRegionLeftRightListCursor;
 
-// VTABLE: LEGO1 0x100dcc40 SYNTHETIC
+// VTABLE: LEGO1 0x100dcc40
 // class MxCollection<MxRegionLeftRight *>
 
-// VTABLE: LEGO1 0x100dcc58 SYNTHETIC
+// VTABLE: LEGO1 0x100dcc58
 // class MxList<MxRegionLeftRight *>
 
-// VTABLE: LEGO1 0x100dcc70 SYNTHETIC
+// VTABLE: LEGO1 0x100dcc70
 // class MxPtrList<MxRegionLeftRight>
 
 // VTABLE: LEGO1 0x100dcc88
diff --git a/LEGO1/mxstreamchunklist.h b/LEGO1/mxstreamchunklist.h
index d8d9f170..4924b6ff 100644
--- a/LEGO1/mxstreamchunklist.h
+++ b/LEGO1/mxstreamchunklist.h
@@ -6,10 +6,10 @@
 
 class MxStreamChunk;
 
-// VTABLE: LEGO1 0x100dc5d0 SYNTHETIC
+// VTABLE: LEGO1 0x100dc5d0
 // class MxCollection<MxStreamChunk *>
 
-// VTABLE: LEGO1 0x100dc5e8 SYNTHETIC
+// VTABLE: LEGO1 0x100dc5e8
 // class MxList<MxStreamChunk *>
 
 // VTABLE: LEGO1 0x100dc600
diff --git a/LEGO1/mxstreamlist.cpp b/LEGO1/mxstreamlist.cpp
index 0d7d7051..1fa4dd48 100644
--- a/LEGO1/mxstreamlist.cpp
+++ b/LEGO1/mxstreamlist.cpp
@@ -8,7 +8,7 @@ DECOMP_SIZE_ASSERT(MxStreamListMxDSSubscriber, 0xc);
 // FUNCTION: LEGO1 0x100bfa80
 MxDSAction* MxStreamListMxDSAction::Find(MxDSAction* p_action, MxBool p_delete)
 {
-	// DECOMP: ALPHA 0x1008b99d ?
+	// DECOMP ALPHA 0x1008b99d ?
 
 	MxDSAction* found = NULL;
 
diff --git a/LEGO1/mxvariabletable.h b/LEGO1/mxvariabletable.h
index 7da6e0fc..1a91da26 100644
--- a/LEGO1/mxvariabletable.h
+++ b/LEGO1/mxvariabletable.h
@@ -20,10 +20,10 @@ class MxVariableTable : public MxHashTable<MxVariable*> {
 	virtual MxU32 Hash(MxVariable*) override;                // vtable+0x18
 };
 
-// VTABLE: LEGO1 0x100dc1b0 SYNTHETIC
+// VTABLE: LEGO1 0x100dc1b0
 // class MxCollection<MxVariable *>
 
-// VTABLE: LEGO1 0x100dc1e8 SYNTHETIC
+// VTABLE: LEGO1 0x100dc1e8
 // class MxHashTable<MxVariable *>
 
 #endif // MXVARIABLETABLE_H

From cbeb4168e0603bd31abf3ab920359adcf2cf43cf Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Sat, 2 Dec 2023 18:32:39 -0500
Subject: [PATCH 15/17] Get vtable class name

---
 tools/isledecomp/isledecomp/parser/parser.py | 14 +++++---
 tools/isledecomp/isledecomp/parser/util.py   | 30 +++++++++++++++++
 tools/isledecomp/tests/test_parser.py        |  1 +
 tools/isledecomp/tests/test_parser_util.py   | 34 ++++++++++++++++++++
 4 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py
index 37e8d386..89b79928 100644
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -7,6 +7,7 @@
     is_blank_or_comment,
     match_marker,
     is_marker_exact,
+    get_class_name,
     get_synthetic_name,
     remove_trailing_comment,
 )
@@ -195,14 +196,18 @@ def _vtable_marker(self, marker: DecompMarker):
             self._syntax_warning(ParserError.DUPLICATE_MODULE)
         self.state = ReaderState.IN_VTABLE
 
-    def _vtable_done(self):
+    def _vtable_done(self, class_name: str = None):
+        if class_name is None:
+            # Best we can do
+            class_name = self.last_line.strip()
+
         for marker in self.tbl_markers.iter():
             self.vtables.append(
                 ParserVtable(
                     line_number=self.line_number,
                     module=marker.module,
                     offset=marker.offset,
-                    class_name=self.last_line.strip(),
+                    class_name=class_name,
                 )
             )
 
@@ -380,8 +385,9 @@ def read_line(self, line: str):
                 self._variable_done()
 
         elif self.state == ReaderState.IN_VTABLE:
-            if not is_blank_or_comment(line):
-                self._vtable_done()
+            vtable_class = get_class_name(line)
+            if vtable_class is not None:
+                self._vtable_done(class_name=vtable_class)
 
     def read_lines(self, lines: Iterable):
         for line in lines:
diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py
index e0b25d85..a3e9360f 100644
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -63,3 +63,33 @@ def match_marker(line: str) -> DecompMarker | None:
 
 def is_marker_exact(line: str) -> bool:
     return markerExactRegex.match(line) is not None
+
+
+template_class_decl_regex = re.compile(
+    r"\s*(?:\/\/)?\s*class (\w+)<([\w]+)\s*(\*+)?\s*>"
+)
+
+
+class_decl_regex = re.compile(r"\s*(?:\/\/)?\s*class (\w+)")
+
+
+def get_class_name(line: str) -> str | None:
+    """For VTABLE markers, extract the class name from the code line or comment
+    where it appears."""
+
+    match = template_class_decl_regex.match(line)
+    if match is not None:
+        # For template classes, we should reformat the class name so it matches
+        # the output from cvdump: one space between the template type and any asterisks
+        # if it is a pointer type.
+        (class_name, template_type, asterisks) = match.groups()
+        if asterisks is not None:
+            return f"{class_name}<{template_type} {asterisks}>"
+
+        return f"{class_name}<{template_type}>"
+
+    match = class_decl_regex.match(line)
+    if match is not None:
+        return match.group(1)
+
+    return None
diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py
index 7954aee1..30092a97 100644
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -193,6 +193,7 @@ def test_multiple_vtables(parser):
     )
     assert len(parser.alerts) == 0
     assert len(parser.vtables) == 2
+    assert parser.vtables[0].class_name == "MxString"
 
 
 def test_multiple_vtables_same_module(parser):
diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py
index 131a8ab3..5cbe5c8b 100644
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@@ -5,6 +5,7 @@
     is_blank_or_comment,
     match_marker,
     is_marker_exact,
+    get_class_name,
 )
 
 
@@ -96,3 +97,36 @@ def test_marker_dict_type_replace():
     markers = list(d.iter())
     assert len(markers) == 1
     assert markers[0].type == "FUNCTION"
+
+
+class_name_match_cases = [
+    ("class MxString {", "MxString"),
+    ("// class MxString", "MxString"),
+    ("class MxString : public MxCore {", "MxString"),
+    ("class MxPtrList<MxPresenter>", "MxPtrList<MxPresenter>"),
+    # If it is possible to match the symbol MxList<LegoPathController *>::`vftable'
+    # we should get the correct class name if possible. If the template type is a pointer,
+    # the asterisk and class name are separated by one space.
+    ("// class MxList<LegoPathController *>", "MxList<LegoPathController *>"),
+    ("// class MxList<LegoPathController*>", "MxList<LegoPathController *>"),
+    ("// class MxList<LegoPathController* >", "MxList<LegoPathController *>"),
+    # I don't know if this would ever come up, but sure, why not?
+    ("// class MxList<LegoPathController**>", "MxList<LegoPathController **>"),
+]
+
+
+@pytest.mark.parametrize("line, class_name", class_name_match_cases)
+def test_get_class_name(line: str, class_name: str):
+    assert get_class_name(line) == class_name
+
+
+class_name_no_match_cases = [
+    "MxString { ",
+    "clas MxString",
+    "// MxPtrList<MxPresenter>::`scalar deleting destructor'",
+]
+
+
+@pytest.mark.parametrize("line", class_name_no_match_cases)
+def test_get_class_name_none(line: str):
+    assert get_class_name(line) is None

From cbc6105b5728379dc55d4de744f19ff7fe534583 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Sat, 2 Dec 2023 18:57:23 -0500
Subject: [PATCH 16/17] Vtable marker should identify struct

---
 tools/isledecomp/isledecomp/parser/util.py | 4 ++--
 tools/isledecomp/tests/test_parser_util.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py
index a3e9360f..99ab1c56 100644
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -66,11 +66,11 @@ def is_marker_exact(line: str) -> bool:
 
 
 template_class_decl_regex = re.compile(
-    r"\s*(?:\/\/)?\s*class (\w+)<([\w]+)\s*(\*+)?\s*>"
+    r"\s*(?:\/\/)?\s*(?:class|struct) (\w+)<([\w]+)\s*(\*+)?\s*>"
 )
 
 
-class_decl_regex = re.compile(r"\s*(?:\/\/)?\s*class (\w+)")
+class_decl_regex = re.compile(r"\s*(?:\/\/)?\s*(?:class|struct) (\w+)")
 
 
 def get_class_name(line: str) -> str | None:
diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py
index 5cbe5c8b..643abf3e 100644
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@@ -100,6 +100,7 @@ def test_marker_dict_type_replace():
 
 
 class_name_match_cases = [
+    ("struct MxString {", "MxString"),
     ("class MxString {", "MxString"),
     ("// class MxString", "MxString"),
     ("class MxString : public MxCore {", "MxString"),

From dec876adcc9bb9fff78e45c4fc3b627d71b8322d Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Sat, 2 Dec 2023 18:58:23 -0500
Subject: [PATCH 17/17] No colon for SIZE comment

---
 LEGO1/mxpalette.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LEGO1/mxpalette.cpp b/LEGO1/mxpalette.cpp
index aefe57c2..bc1df5ff 100644
--- a/LEGO1/mxpalette.cpp
+++ b/LEGO1/mxpalette.cpp
@@ -4,7 +4,7 @@
 #include "mxvideomanager.h"
 
 // GLOBAL: LEGO1 0x10102188
-// SIZE: 0x400
+// SIZE 0x400
 PALETTEENTRY g_defaultPaletteEntries[256] = {
 	{0u, 0u, 0u, 0u},       {128u, 0u, 0u, 0u},     {0u, 128u, 0u, 0u},     {128u, 128u, 0u, 0u},
 	{0u, 0u, 128u, 0u},     {128u, 0u, 128u, 0u},   {0u, 128u, 128u, 0u},   {128u, 128u, 128u, 0u},