From a1c6cb5dc93d8d5c5b53e7064dc15b2b7c060590 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Sun, 29 Oct 2023 18:15:37 -0400
Subject: [PATCH] Refactor checkorder into reusable isledecomp module

---
 .gitignore                                    |   1 +
 tools/checkorder/checkorder.py                | 130 +-----------------
 tools/checkorder/requirements.txt             |   1 +
 tools/isledecomp/.gitignore                   |   1 +
 tools/isledecomp/isledecomp/__init__.py       |   0
 tools/isledecomp/isledecomp/dir.py            |  17 +++
 .../isledecomp/isledecomp/parser/__init__.py  |   1 +
 tools/isledecomp/isledecomp/parser/parser.py  |  73 ++++++++++
 tools/isledecomp/isledecomp/parser/util.py    |  47 +++++++
 tools/isledecomp/setup.py                     |   9 ++
 tools/isledecomp/tests/__init__.py            |   0
 tools/isledecomp/tests/samples/basic_file.cpp |  22 +++
 .../tests/samples/missing_offset.cpp          |  16 +++
 .../tests/samples/oneline_function.cpp        |  12 ++
 .../isledecomp/tests/samples/out_of_order.cpp |  20 +++
 tools/isledecomp/tests/test_parser.py         |  72 ++++++++++
 tools/isledecomp/tests/test_parser_util.py    |  87 ++++++++++++
 17 files changed, 386 insertions(+), 123 deletions(-)
 create mode 100644 tools/checkorder/requirements.txt
 create mode 100644 tools/isledecomp/.gitignore
 create mode 100644 tools/isledecomp/isledecomp/__init__.py
 create mode 100644 tools/isledecomp/isledecomp/dir.py
 create mode 100644 tools/isledecomp/isledecomp/parser/__init__.py
 create mode 100644 tools/isledecomp/isledecomp/parser/parser.py
 create mode 100644 tools/isledecomp/isledecomp/parser/util.py
 create mode 100644 tools/isledecomp/setup.py
 create mode 100644 tools/isledecomp/tests/__init__.py
 create mode 100644 tools/isledecomp/tests/samples/basic_file.cpp
 create mode 100644 tools/isledecomp/tests/samples/missing_offset.cpp
 create mode 100644 tools/isledecomp/tests/samples/oneline_function.cpp
 create mode 100644 tools/isledecomp/tests/samples/out_of_order.cpp
 create mode 100644 tools/isledecomp/tests/test_parser.py
 create mode 100644 tools/isledecomp/tests/test_parser_util.py

diff --git a/.gitignore b/.gitignore
index 8768f944..2fe8f4e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ ISLE.EXE
 LEGO1.DLL
 build/
 *.swp
+*.pyc
\ No newline at end of file
diff --git a/tools/checkorder/checkorder.py b/tools/checkorder/checkorder.py
index 8ff8f57e..e1172290 100644
--- a/tools/checkorder/checkorder.py
+++ b/tools/checkorder/checkorder.py
@@ -1,129 +1,13 @@
 import os
-import re
 import sys
 import argparse
-from typing import List, Iterator, TextIO
-from collections import namedtuple
-from enum import Enum
-
-
-class ReaderState(Enum):
-    WANT_OFFSET = 0
-    WANT_SIG = 1
-    IN_FUNC = 2
-
-
-CodeBlock = namedtuple('CodeBlock',
-                       ['offset', 'signature', 'start_line', 'end_line'])
-
-# To match a reasonable variance of formatting for the offset comment
-offsetCommentRegex = re.compile(r'//\s?OFFSET:\s?\w+ (?:0x)?([a-f0-9]+)',
-                                flags=re.I)
-
-# To match the exact syntax (text upper case, hex lower case, with spaces)
-# that is used in most places
-offsetCommentExactRegex = re.compile(r'// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)')
-
-
-def is_blank_or_comment(line: str) -> bool:
-    """Helper to read ahead adter the offset comment is matched.
-       There could be blank lines or other comments before the
-       function signature, and we want to skip those."""
-    line_strip = line.strip()
-    return (len(line_strip) == 0
-            or line_strip.startswith('//')
-            or line_strip.startswith('/*')
-            or line_strip.endswith('*/'))
-
-
-def is_exact_offset_comment(line: str) -> bool:
-    """If the offset comment does not match our (unofficial) syntax
-       we may want to alert the user to fix it for style points."""
-    return offsetCommentExactRegex.match(line) is not None
-
-
-def match_offset_comment(line: str) -> str | None:
-    # TODO: intended to skip the expensive regex match, but is it necessary?
-    if not line.startswith('//'):
-        return None
-
-    match = offsetCommentRegex.match(line)
-    return match.group(1) if match is not None else None
-
-
-def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
-    """Read the IO stream (file) line-by-line and give the following report:
-       Foreach code block (function) in the file, what are its starting and
-       ending line numbers, and what is the given offset in the original
-       binary. We expect the result to be ordered by line number because we
-       are reading the file from start to finish."""
-
-    blocks = []
-
-    offset = None
-    function_sig = None
-    start_line = None
-    state = ReaderState.WANT_OFFSET
-
-    for line_no, line in enumerate(stream):
-        if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC):
-            # Naive but reasonable assumption that functions will end with
-            # a curly brace on its own line with no prepended spaces.
-            if line.startswith('}'):
-                # TODO: could streamline this and the next case
-                block = CodeBlock(offset=offset,
-                                  signature=function_sig,
-                                  start_line=start_line,
-                                  end_line=line_no)
-
-                blocks.append(block)
-                state = ReaderState.WANT_OFFSET
-            elif match_offset_comment(line) is not None:
-                # We hit another offset unexpectedly before detecting the
-                # end of the function. We can recover easily by just
-                # ending the function here.
-                block = CodeBlock(offset=offset,
-                                  signature=function_sig,
-                                  start_line=start_line,
-                                  end_line=line_no - 1)
-
-                blocks.append(block)
-                state = ReaderState.WANT_OFFSET
-
-            # We want to grab the function signature so we can identify
-            # the code block. Skip any blank lines or comments
-            # that follow the offset comment.
-            elif (not is_blank_or_comment(line)
-                  and state == ReaderState.WANT_SIG):
-                function_sig = line.strip()
-                state = ReaderState.IN_FUNC
-
-        if state == ReaderState.WANT_OFFSET:
-            match = match_offset_comment(line)
-            if match is not None:
-                offset = int(match, 16)
-                start_line = line_no
-                state = ReaderState.WANT_SIG
-
-    return blocks
-
-
-def file_is_cpp(filename: str) -> bool:
-    # TODO: expand to check header files also?
-    (basefile, ext) = os.path.splitext(filename)
-    return ext.lower() == '.cpp'
-
-
-def walk_source_dir(source: str) -> Iterator[tuple]:
-    """Generator to walk the given directory recursively and return
-       any .cpp files found."""
-
-    for subdir, dirs, files in os.walk(source):
-        for file in files:
-            if not file_is_cpp(file):
-                continue
-
-            yield os.path.join(subdir, file)
+from typing import TextIO
+from isledecomp.dir import walk_source_dir
+from isledecomp.parser import find_code_blocks
+from isledecomp.parser.util import (
+    match_offset_comment,
+    is_exact_offset_comment
+)
 
 
 def sig_truncate(sig: str) -> str:
diff --git a/tools/checkorder/requirements.txt b/tools/checkorder/requirements.txt
new file mode 100644
index 00000000..d6261da4
--- /dev/null
+++ b/tools/checkorder/requirements.txt
@@ -0,0 +1 @@
+-e ../isledecomp/
\ No newline at end of file
diff --git a/tools/isledecomp/.gitignore b/tools/isledecomp/.gitignore
new file mode 100644
index 00000000..00aa29cb
--- /dev/null
+++ b/tools/isledecomp/.gitignore
@@ -0,0 +1 @@
+isledecomp.egg-info/
\ No newline at end of file
diff --git a/tools/isledecomp/isledecomp/__init__.py b/tools/isledecomp/isledecomp/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/isledecomp/isledecomp/dir.py b/tools/isledecomp/isledecomp/dir.py
new file mode 100644
index 00000000..dce85640
--- /dev/null
+++ b/tools/isledecomp/isledecomp/dir.py
@@ -0,0 +1,17 @@
+import os
+from typing import Iterator
+
+
+def file_is_cpp(filename: str) -> bool:
+    (basefile, ext) = os.path.splitext(filename)
+    return ext.lower() in ('.h', '.cpp')
+
+
+def walk_source_dir(source: str) -> Iterator[str]:
+    """Generator to walk the given directory recursively and return
+       any C++ files found."""
+
+    for subdir, dirs, files in os.walk(source):
+        for file in files:
+            if file_is_cpp(file):
+                yield os.path.join(subdir, file)
diff --git a/tools/isledecomp/isledecomp/parser/__init__.py b/tools/isledecomp/isledecomp/parser/__init__.py
new file mode 100644
index 00000000..0d504619
--- /dev/null
+++ b/tools/isledecomp/isledecomp/parser/__init__.py
@@ -0,0 +1 @@
+from .parser import find_code_blocks
diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py
new file mode 100644
index 00000000..1a6421de
--- /dev/null
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -0,0 +1,73 @@
+# C++ file parser
+
+from typing import List, TextIO
+from enum import Enum
+from .util import (
+    CodeBlock,
+    is_blank_or_comment,
+    match_offset_comment,
+    is_exact_offset_comment,
+)
+
+
+class ReaderState(Enum):
+    WANT_OFFSET = 0
+    WANT_SIG = 1
+    IN_FUNC = 2
+
+
+def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
+    """Read the IO stream (file) line-by-line and give the following report:
+       Foreach code block (function) in the file, what are its starting and
+       ending line numbers, and what is the given offset in the original
+       binary. We expect the result to be ordered by line number because we
+       are reading the file from start to finish."""
+
+    blocks = []
+
+    offset = None
+    function_sig = None
+    start_line = None
+    state = ReaderState.WANT_OFFSET
+
+    for line_no, line in enumerate(stream):
+        if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC):
+            # Naive but reasonable assumption that functions will end with
+            # a curly brace on its own line with no prepended spaces.
+            if line.startswith('}'):
+                # TODO: could streamline this and the next case
+                block = CodeBlock(offset=offset,
+                                  signature=function_sig,
+                                  start_line=start_line,
+                                  end_line=line_no)
+
+                blocks.append(block)
+                state = ReaderState.WANT_OFFSET
+            elif match_offset_comment(line) is not None:
+                # We hit another offset unexpectedly before detecting the
+                # end of the function. We can recover easily by just
+                # ending the function here.
+                block = CodeBlock(offset=offset,
+                                  signature=function_sig,
+                                  start_line=start_line,
+                                  end_line=line_no - 1)
+
+                blocks.append(block)
+                state = ReaderState.WANT_OFFSET
+
+            # We want to grab the function signature so we can identify
+            # the code block. Skip any blank lines or comments
+            # that follow the offset comment.
+            elif (not is_blank_or_comment(line)
+                  and state == ReaderState.WANT_SIG):
+                function_sig = line.strip()
+                state = ReaderState.IN_FUNC
+
+        if state == ReaderState.WANT_OFFSET:
+            match = match_offset_comment(line)
+            if match is not None:
+                offset = int(match, 16)
+                start_line = line_no
+                state = ReaderState.WANT_SIG
+
+    return blocks
diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py
new file mode 100644
index 00000000..d93cb013
--- /dev/null
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -0,0 +1,47 @@
+# C++ Parser utility functions and data structures
+import re
+from collections import namedtuple
+
+
+CodeBlock = namedtuple('CodeBlock',
+                       ['offset', 'signature', 'start_line', 'end_line'])
+
+
+FunctionOffset = namedtuple('FunctionOffset',
+                            ['raw', 'address', 'is_stub'])
+
+
+# To match a reasonable variance of formatting for the offset comment
+offsetCommentRegex = re.compile(r'//\s*OFFSET:\s*\w+\s+(?:0x)?([a-f0-9]+)',
+                                flags=re.I)
+
+# To match the exact syntax (text upper case, hex lower case, with spaces)
+# that is used in most places
+offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)(?: STUB)?$')
+
+
+def is_blank_or_comment(line: str) -> bool:
+    """Helper to read ahead after the offset comment is matched.
+       There could be blank lines or other comments before the
+       function signature, and we want to skip those."""
+    line_strip = line.strip()
+    return (len(line_strip) == 0
+            or line_strip.startswith('//')
+            or line_strip.startswith('/*')
+            or line_strip.endswith('*/'))
+
+
+def is_exact_offset_comment(line: str) -> bool:
+    """If the offset comment does not match our (unofficial) syntax
+       we may want to alert the user to fix it for style points."""
+    return offsetCommentExactRegex.match(line) is not None
+
+
+def match_offset_comment(line: str) -> str | None:
+    # TODO: intended to skip the expensive regex match, but is it necessary?
+    # TODO: this will skip indented offsets
+    if not line.startswith('//'):
+        return None
+
+    match = offsetCommentRegex.match(line)
+    return match.group(1) if match is not None else None
diff --git a/tools/isledecomp/setup.py b/tools/isledecomp/setup.py
new file mode 100644
index 00000000..897b802e
--- /dev/null
+++ b/tools/isledecomp/setup.py
@@ -0,0 +1,9 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='isledecomp',
+    version='0.1.0',
+    description='Python tools for the isledecomp project',
+    packages=find_packages(),
+    tests_require=['pytest'],
+)
diff --git a/tools/isledecomp/tests/__init__.py b/tools/isledecomp/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/isledecomp/tests/samples/basic_file.cpp b/tools/isledecomp/tests/samples/basic_file.cpp
new file mode 100644
index 00000000..e2dd8b9e
--- /dev/null
+++ b/tools/isledecomp/tests/samples/basic_file.cpp
@@ -0,0 +1,22 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// A very simple well-formed code file
+
+// OFFSET: LEGO1 0x1234
+void function01()
+{
+  // TODO
+}
+
+// OFFSET: LEGO1 0x2345
+void function02()
+{
+  // TODO
+}
+
+// OFFSET: LEGO1 0x3456
+void function03()
+{
+  // TODO
+}
diff --git a/tools/isledecomp/tests/samples/missing_offset.cpp b/tools/isledecomp/tests/samples/missing_offset.cpp
new file mode 100644
index 00000000..953c2476
--- /dev/null
+++ b/tools/isledecomp/tests/samples/missing_offset.cpp
@@ -0,0 +1,16 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+#include <stdio.h>
+
+int no_offset_comment()
+{
+  static int dummy = 123;
+  return -1;
+}
+
+// OFFSET: LEGO1 0xdeadbeef
+void regular_ole_function()
+{
+  printf("hi there");
+}
diff --git a/tools/isledecomp/tests/samples/oneline_function.cpp b/tools/isledecomp/tests/samples/oneline_function.cpp
new file mode 100644
index 00000000..80b2c3ff
--- /dev/null
+++ b/tools/isledecomp/tests/samples/oneline_function.cpp
@@ -0,0 +1,12 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// OFFSET: LEGO1 0x1234
+void short_function() { static char* msg = "oneliner"; }
+
+// OFFSET: LEGO1 0x5555
+void function_after_one_liner()
+{
+  // This function comes after the previous that is on a single line.
+  // Do we report the offset for this one correctly?
+}
diff --git a/tools/isledecomp/tests/samples/out_of_order.cpp b/tools/isledecomp/tests/samples/out_of_order.cpp
new file mode 100644
index 00000000..3808a972
--- /dev/null
+++ b/tools/isledecomp/tests/samples/out_of_order.cpp
@@ -0,0 +1,20 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// OFFSET: LEGO1 0x1001
+void function_order01()
+{
+    // TODO
+}
+
+// OFFSET: LEGO1 0x1003
+void function_order03()
+{
+    // TODO
+}
+
+// OFFSET: LEGO1 0x1002
+void function_order02()
+{
+    // TODO
+}
diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py
new file mode 100644
index 00000000..e76553fc
--- /dev/null
+++ b/tools/isledecomp/tests/test_parser.py
@@ -0,0 +1,72 @@
+import os
+import pytest
+from typing import List, TextIO
+from isledecomp.parser import find_code_blocks
+from isledecomp.parser.util import CodeBlock
+
+SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples')
+
+
+def sample_file(filename: str) -> TextIO:
+    """Wrapper for opening the samples from the directory that does not
+       depend on the cwd where we run the test"""
+    full_path = os.path.join(SAMPLE_DIR, filename)
+    return open(full_path, 'r')
+
+
+def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
+    """Helper to make this more idiomatic"""
+    just_offsets = [block.offset for block in blocks]
+    return just_offsets == sorted(just_offsets)
+
+
+# Tests are below #
+
+
+def test_sanity():
+    """Read a very basic file"""
+    with sample_file('basic_file.cpp') as f:
+        blocks = find_code_blocks(f)
+
+    assert len(blocks) == 3
+    assert code_blocks_are_sorted(blocks) is True
+    # n.b. The parser returns line numbers as 0-based
+    assert blocks[0].start_line == 5
+    assert blocks[0].end_line == 9
+
+
+def test_oneline():
+    """(Assuming clang-format permits this) This sample has a function
+    on a single line. This will test the end-of-function detection"""
+    with sample_file('oneline_function.cpp') as f:
+        blocks = find_code_blocks(f)
+
+    assert len(blocks) == 2
+    assert blocks[0].start_line == 3
+    # TODO: Because of the way it works now, this captures the blank line
+    # as part of the function. That's not *incorrect* per se, but
+    # this needs to be more consistent if we want the tool to sort the
+    # code blocks in the file.
+    assert blocks[0].end_line == 5
+
+
+def test_missing_offset():
+    """What if the function doesn't have an offset comment?"""
+    with sample_file('missing_offset.cpp') as f:
+        blocks = find_code_blocks(f)
+
+    # TODO: For now, the function without the offset will just be ignored.
+    # Would be the same outcome if the comment was present but mangled and
+    # we failed to match it. We should detect these cases in the future.
+    assert len(blocks) == 1
+
+
+def test_jumbled_case():
+    """The parser just reports what it sees. It is the responsibility of
+       the downstream tools to do something about a jumbled file.
+       Just verify that we are reading it correctly."""
+    with sample_file('out_of_order.cpp') as f:
+        blocks = find_code_blocks(f)
+
+    assert len(blocks) == 3
+    assert code_blocks_are_sorted(blocks) is False
diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py
new file mode 100644
index 00000000..33a98420
--- /dev/null
+++ b/tools/isledecomp/tests/test_parser_util.py
@@ -0,0 +1,87 @@
+import pytest
+from isledecomp.parser.util import (
+    is_blank_or_comment,
+    match_offset_comment,
+    is_exact_offset_comment,
+)
+
+
+blank_or_comment_param = [
+    (True,  ''),
+    (True,  '\t'),
+    (True,  '    '),
+    (False, '\tint abc=123;'),
+    (True,  '// OFFSET: LEGO1 0xdeadbeef'),
+    (True,  '   /* Block comment beginning'),
+    (True,  'Block comment ending */   '),
+
+    # TODO: does clang-format have anything to say about these cases?
+    (False, 'x++; // Comment folows'),
+    (False, 'x++; /* Block comment begins'),
+]
+
+
+@pytest.mark.parametrize('expected, line', blank_or_comment_param)
+def test_is_blank_or_comment(line: str, expected: bool):
+    assert is_blank_or_comment(line) is expected
+
+
+offset_comment_samples = [
+    # (can_parse: bool, exact_match: bool, line: str)
+    # Should match both expected modules with optional STUB marker
+    (True,  True,  '// OFFSET: LEGO1 0xdeadbeef'),
+    (True,  True,  '// OFFSET: LEGO1 0xdeadbeef STUB'),
+    (True,  True,  '// OFFSET: ISLE 0x12345678'),
+    (True,  True,  '// OFFSET: ISLE 0x12345678 STUB'),
+
+    # No trailing spaces allowed
+    (True,  False, '// OFFSET: LEGO1 0xdeadbeef  '),
+    (True,  False, '// OFFSET: LEGO1 0xdeadbeef STUB '),
+
+    # Must have exactly one space between elements
+    (True,  False, '//OFFSET: ISLE 0xdeadbeef'),
+    (True,  False, '// OFFSET:ISLE 0xdeadbeef'),
+    (True,  False, '//  OFFSET: ISLE 0xdeadbeef'),
+    (True,  False, '// OFFSET:  ISLE 0xdeadbeef'),
+    (True,  False, '// OFFSET: ISLE  0xdeadbeef'),
+    (True,  False, '// OFFSET: ISLE 0xdeadbeef  STUB'),
+
+    # Must have 0x prefix for hex number
+    (True,  False, '// OFFSET: ISLE deadbeef'),
+
+    # Offset, module name, and STUB must be uppercase
+    (True,  False, '// offset: ISLE 0xdeadbeef'),
+    (True,  False, '// offset: isle 0xdeadbeef'),
+    (True,  False, '// OFFSET: LEGO1 0xdeadbeef stub'),
+
+    # Hex string must be lowercase
+    (True,  False, '// OFFSET: ISLE 0xDEADBEEF'),
+
+    # TODO: How flexible should we be with matching the module name?
+    (True,  True,  '// OFFSET: OMNI 0x12345678'),
+    (True,  True,  '// OFFSET: LEG01 0x12345678'),
+    (True,  False,  '// OFFSET: hello 0x12345678'),
+
+    # Not close enough to match
+    (False, False, '// OFFSET: ISLE0x12345678'),
+    (False, False, '// OFFSET: 0x12345678'),
+    (False, False, '// LEGO1: 0x12345678'),
+
+    # Hex string shorter than 8 characters
+    (True,  True,  '// OFFSET: LEGO1 0x1234'),
+
+    # TODO: These match but shouldn't.
+    # (False, False, '// OFFSET: LEGO1 0'),
+    # (False, False, '// OFFSET: LEGO1 0x'),
+]
+
+
+@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
+def test_offset_match(line: str, match: bool, exact):
+    did_match = match_offset_comment(line) is not None
+    assert did_match is match
+
+
+@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
+def test_exact_offset_comment(line: str, exact: bool, match):
+    assert is_exact_offset_comment(line) is exact