From a1c6cb5dc93d8d5c5b53e7064dc15b2b7c060590 Mon Sep 17 00:00:00 2001 From: disinvite Date: Sun, 29 Oct 2023 18:15:37 -0400 Subject: [PATCH] Refactor checkorder into reusable isledecomp module --- .gitignore | 1 + tools/checkorder/checkorder.py | 130 +----------------- tools/checkorder/requirements.txt | 1 + tools/isledecomp/.gitignore | 1 + tools/isledecomp/isledecomp/__init__.py | 0 tools/isledecomp/isledecomp/dir.py | 17 +++ .../isledecomp/isledecomp/parser/__init__.py | 1 + tools/isledecomp/isledecomp/parser/parser.py | 73 ++++++++++ tools/isledecomp/isledecomp/parser/util.py | 47 +++++++ tools/isledecomp/setup.py | 9 ++ tools/isledecomp/tests/__init__.py | 0 tools/isledecomp/tests/samples/basic_file.cpp | 22 +++ .../tests/samples/missing_offset.cpp | 16 +++ .../tests/samples/oneline_function.cpp | 12 ++ .../isledecomp/tests/samples/out_of_order.cpp | 20 +++ tools/isledecomp/tests/test_parser.py | 72 ++++++++++ tools/isledecomp/tests/test_parser_util.py | 87 ++++++++++++ 17 files changed, 386 insertions(+), 123 deletions(-) create mode 100644 tools/checkorder/requirements.txt create mode 100644 tools/isledecomp/.gitignore create mode 100644 tools/isledecomp/isledecomp/__init__.py create mode 100644 tools/isledecomp/isledecomp/dir.py create mode 100644 tools/isledecomp/isledecomp/parser/__init__.py create mode 100644 tools/isledecomp/isledecomp/parser/parser.py create mode 100644 tools/isledecomp/isledecomp/parser/util.py create mode 100644 tools/isledecomp/setup.py create mode 100644 tools/isledecomp/tests/__init__.py create mode 100644 tools/isledecomp/tests/samples/basic_file.cpp create mode 100644 tools/isledecomp/tests/samples/missing_offset.cpp create mode 100644 tools/isledecomp/tests/samples/oneline_function.cpp create mode 100644 tools/isledecomp/tests/samples/out_of_order.cpp create mode 100644 tools/isledecomp/tests/test_parser.py create mode 100644 tools/isledecomp/tests/test_parser_util.py diff --git a/.gitignore b/.gitignore index 8768f944..2fe8f4e3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ ISLE.EXE LEGO1.DLL build/ *.swp +*.pyc \ No newline at end of file diff --git a/tools/checkorder/checkorder.py b/tools/checkorder/checkorder.py index 8ff8f57e..e1172290 100644 --- a/tools/checkorder/checkorder.py +++ b/tools/checkorder/checkorder.py @@ -1,129 +1,13 @@ import os -import re import sys import argparse -from typing import List, Iterator, TextIO -from collections import namedtuple -from enum import Enum - - -class ReaderState(Enum): - WANT_OFFSET = 0 - WANT_SIG = 1 - IN_FUNC = 2 - - -CodeBlock = namedtuple('CodeBlock', - ['offset', 'signature', 'start_line', 'end_line']) - -# To match a reasonable variance of formatting for the offset comment -offsetCommentRegex = re.compile(r'//\s?OFFSET:\s?\w+ (?:0x)?([a-f0-9]+)', - flags=re.I) - -# To match the exact syntax (text upper case, hex lower case, with spaces) -# that is used in most places -offsetCommentExactRegex = re.compile(r'// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)') - - -def is_blank_or_comment(line: str) -> bool: - """Helper to read ahead adter the offset comment is matched. - There could be blank lines or other comments before the - function signature, and we want to skip those.""" - line_strip = line.strip() - return (len(line_strip) == 0 - or line_strip.startswith('//') - or line_strip.startswith('/*') - or line_strip.endswith('*/')) - - -def is_exact_offset_comment(line: str) -> bool: - """If the offset comment does not match our (unofficial) syntax - we may want to alert the user to fix it for style points.""" - return offsetCommentExactRegex.match(line) is not None - - -def match_offset_comment(line: str) -> str | None: - # TODO: intended to skip the expensive regex match, but is it necessary? - if not line.startswith('//'): - return None - - match = offsetCommentRegex.match(line) - return match.group(1) if match is not None else None - - -def find_code_blocks(stream: TextIO) -> List[CodeBlock]: - """Read the IO stream (file) line-by-line and give the following report: - Foreach code block (function) in the file, what are its starting and - ending line numbers, and what is the given offset in the original - binary. We expect the result to be ordered by line number because we - are reading the file from start to finish.""" - - blocks = [] - - offset = None - function_sig = None - start_line = None - state = ReaderState.WANT_OFFSET - - for line_no, line in enumerate(stream): - if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC): - # Naive but reasonable assumption that functions will end with - # a curly brace on its own line with no prepended spaces. - if line.startswith('}'): - # TODO: could streamline this and the next case - block = CodeBlock(offset=offset, - signature=function_sig, - start_line=start_line, - end_line=line_no) - - blocks.append(block) - state = ReaderState.WANT_OFFSET - elif match_offset_comment(line) is not None: - # We hit another offset unexpectedly before detecting the - # end of the function. We can recover easily by just - # ending the function here. - block = CodeBlock(offset=offset, - signature=function_sig, - start_line=start_line, - end_line=line_no - 1) - - blocks.append(block) - state = ReaderState.WANT_OFFSET - - # We want to grab the function signature so we can identify - # the code block. Skip any blank lines or comments - # that follow the offset comment. - elif (not is_blank_or_comment(line) - and state == ReaderState.WANT_SIG): - function_sig = line.strip() - state = ReaderState.IN_FUNC - - if state == ReaderState.WANT_OFFSET: - match = match_offset_comment(line) - if match is not None: - offset = int(match, 16) - start_line = line_no - state = ReaderState.WANT_SIG - - return blocks - - -def file_is_cpp(filename: str) -> bool: - # TODO: expand to check header files also? - (basefile, ext) = os.path.splitext(filename) - return ext.lower() == '.cpp' - - -def walk_source_dir(source: str) -> Iterator[tuple]: - """Generator to walk the given directory recursively and return - any .cpp files found.""" - - for subdir, dirs, files in os.walk(source): - for file in files: - if not file_is_cpp(file): - continue - - yield os.path.join(subdir, file) +from typing import TextIO +from isledecomp.dir import walk_source_dir +from isledecomp.parser import find_code_blocks +from isledecomp.parser.util import ( + match_offset_comment, + is_exact_offset_comment +) def sig_truncate(sig: str) -> str: diff --git a/tools/checkorder/requirements.txt b/tools/checkorder/requirements.txt new file mode 100644 index 00000000..d6261da4 --- /dev/null +++ b/tools/checkorder/requirements.txt @@ -0,0 +1 @@ +-e ../isledecomp/ \ No newline at end of file diff --git a/tools/isledecomp/.gitignore b/tools/isledecomp/.gitignore new file mode 100644 index 00000000..00aa29cb --- /dev/null +++ b/tools/isledecomp/.gitignore @@ -0,0 +1 @@ +isledecomp.egg-info/ \ No newline at end of file diff --git a/tools/isledecomp/isledecomp/__init__.py b/tools/isledecomp/isledecomp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/isledecomp/isledecomp/dir.py b/tools/isledecomp/isledecomp/dir.py new file mode 100644 index 00000000..dce85640 --- /dev/null +++ b/tools/isledecomp/isledecomp/dir.py @@ -0,0 +1,17 @@ +import os +from typing import Iterator + + +def file_is_cpp(filename: str) -> bool: + (basefile, ext) = os.path.splitext(filename) + return ext.lower() in ('.h', '.cpp') + + +def walk_source_dir(source: str) -> Iterator[str]: + """Generator to walk the given directory recursively and return + any C++ files found.""" + + for subdir, dirs, files in os.walk(source): + for file in files: + if file_is_cpp(file): + yield os.path.join(subdir, file) diff --git a/tools/isledecomp/isledecomp/parser/__init__.py b/tools/isledecomp/isledecomp/parser/__init__.py new file mode 100644 index 00000000..0d504619 --- /dev/null +++ b/tools/isledecomp/isledecomp/parser/__init__.py @@ -0,0 +1 @@ +from .parser import find_code_blocks diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py new file mode 100644 index 00000000..1a6421de --- /dev/null +++ b/tools/isledecomp/isledecomp/parser/parser.py @@ -0,0 +1,73 @@ +# C++ file parser + +from typing import List, TextIO +from enum import Enum +from .util import ( + CodeBlock, + is_blank_or_comment, + match_offset_comment, + is_exact_offset_comment, +) + + +class ReaderState(Enum): + WANT_OFFSET = 0 + WANT_SIG = 1 + IN_FUNC = 2 + + +def find_code_blocks(stream: TextIO) -> List[CodeBlock]: + """Read the IO stream (file) line-by-line and give the following report: + Foreach code block (function) in the file, what are its starting and + ending line numbers, and what is the given offset in the original + binary. We expect the result to be ordered by line number because we + are reading the file from start to finish.""" + + blocks = [] + + offset = None + function_sig = None + start_line = None + state = ReaderState.WANT_OFFSET + + for line_no, line in enumerate(stream): + if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC): + # Naive but reasonable assumption that functions will end with + # a curly brace on its own line with no prepended spaces. + if line.startswith('}'): + # TODO: could streamline this and the next case + block = CodeBlock(offset=offset, + signature=function_sig, + start_line=start_line, + end_line=line_no) + + blocks.append(block) + state = ReaderState.WANT_OFFSET + elif match_offset_comment(line) is not None: + # We hit another offset unexpectedly before detecting the + # end of the function. We can recover easily by just + # ending the function here. + block = CodeBlock(offset=offset, + signature=function_sig, + start_line=start_line, + end_line=line_no - 1) + + blocks.append(block) + state = ReaderState.WANT_OFFSET + + # We want to grab the function signature so we can identify + # the code block. Skip any blank lines or comments + # that follow the offset comment. + elif (not is_blank_or_comment(line) + and state == ReaderState.WANT_SIG): + function_sig = line.strip() + state = ReaderState.IN_FUNC + + if state == ReaderState.WANT_OFFSET: + match = match_offset_comment(line) + if match is not None: + offset = int(match, 16) + start_line = line_no + state = ReaderState.WANT_SIG + + return blocks diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py new file mode 100644 index 00000000..d93cb013 --- /dev/null +++ b/tools/isledecomp/isledecomp/parser/util.py @@ -0,0 +1,47 @@ +# C++ Parser utility functions and data structures +import re +from collections import namedtuple + + +CodeBlock = namedtuple('CodeBlock', + ['offset', 'signature', 'start_line', 'end_line']) + + +FunctionOffset = namedtuple('FunctionOffset', + ['raw', 'address', 'is_stub']) + + +# To match a reasonable variance of formatting for the offset comment +offsetCommentRegex = re.compile(r'//\s*OFFSET:\s*\w+\s+(?:0x)?([a-f0-9]+)', + flags=re.I) + +# To match the exact syntax (text upper case, hex lower case, with spaces) +# that is used in most places +offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)(?: STUB)?$') + + +def is_blank_or_comment(line: str) -> bool: + """Helper to read ahead after the offset comment is matched. + There could be blank lines or other comments before the + function signature, and we want to skip those.""" + line_strip = line.strip() + return (len(line_strip) == 0 + or line_strip.startswith('//') + or line_strip.startswith('/*') + or line_strip.endswith('*/')) + + +def is_exact_offset_comment(line: str) -> bool: + """If the offset comment does not match our (unofficial) syntax + we may want to alert the user to fix it for style points.""" + return offsetCommentExactRegex.match(line) is not None + + +def match_offset_comment(line: str) -> str | None: + # TODO: intended to skip the expensive regex match, but is it necessary? + # TODO: this will skip indented offsets + if not line.startswith('//'): + return None + + match = offsetCommentRegex.match(line) + return match.group(1) if match is not None else None diff --git a/tools/isledecomp/setup.py b/tools/isledecomp/setup.py new file mode 100644 index 00000000..897b802e --- /dev/null +++ b/tools/isledecomp/setup.py @@ -0,0 +1,9 @@ +from setuptools import setup, find_packages + +setup( + name='isledecomp', + version='0.1.0', + description='Python tools for the isledecomp project', + packages=find_packages(), + tests_require=['pytest'], +) diff --git a/tools/isledecomp/tests/__init__.py b/tools/isledecomp/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/isledecomp/tests/samples/basic_file.cpp b/tools/isledecomp/tests/samples/basic_file.cpp new file mode 100644 index 00000000..e2dd8b9e --- /dev/null +++ b/tools/isledecomp/tests/samples/basic_file.cpp @@ -0,0 +1,22 @@ +// Sample for python unit tests +// Not part of the decomp + +// A very simple well-formed code file + +// OFFSET: LEGO1 0x1234 +void function01() +{ + // TODO +} + +// OFFSET: LEGO1 0x2345 +void function02() +{ + // TODO +} + +// OFFSET: LEGO1 0x3456 +void function03() +{ + // TODO +} diff --git a/tools/isledecomp/tests/samples/missing_offset.cpp b/tools/isledecomp/tests/samples/missing_offset.cpp new file mode 100644 index 00000000..953c2476 --- /dev/null +++ b/tools/isledecomp/tests/samples/missing_offset.cpp @@ -0,0 +1,16 @@ +// Sample for python unit tests +// Not part of the decomp + +#include + +int no_offset_comment() +{ + static int dummy = 123; + return -1; +} + +// OFFSET: LEGO1 0xdeadbeef +void regular_ole_function() +{ + printf("hi there"); +} diff --git a/tools/isledecomp/tests/samples/oneline_function.cpp b/tools/isledecomp/tests/samples/oneline_function.cpp new file mode 100644 index 00000000..80b2c3ff --- /dev/null +++ b/tools/isledecomp/tests/samples/oneline_function.cpp @@ -0,0 +1,12 @@ +// Sample for python unit tests +// Not part of the decomp + +// OFFSET: LEGO1 0x1234 +void short_function() { static char* msg = "oneliner"; } + +// OFFSET: LEGO1 0x5555 +void function_after_one_liner() +{ + // This function comes after the previous that is on a single line. + // Do we report the offset for this one correctly? +} diff --git a/tools/isledecomp/tests/samples/out_of_order.cpp b/tools/isledecomp/tests/samples/out_of_order.cpp new file mode 100644 index 00000000..3808a972 --- /dev/null +++ b/tools/isledecomp/tests/samples/out_of_order.cpp @@ -0,0 +1,20 @@ +// Sample for python unit tests +// Not part of the decomp + +// OFFSET: LEGO1 0x1001 +void function_order01() +{ + // TODO +} + +// OFFSET: LEGO1 0x1003 +void function_order03() +{ + // TODO +} + +// OFFSET: LEGO1 0x1002 +void function_order02() +{ + // TODO +} diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py new file mode 100644 index 00000000..e76553fc --- /dev/null +++ b/tools/isledecomp/tests/test_parser.py @@ -0,0 +1,72 @@ +import os +import pytest +from typing import List, TextIO +from isledecomp.parser import find_code_blocks +from isledecomp.parser.util import CodeBlock + +SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples') + + +def sample_file(filename: str) -> TextIO: + """Wrapper for opening the samples from the directory that does not + depend on the cwd where we run the test""" + full_path = os.path.join(SAMPLE_DIR, filename) + return open(full_path, 'r') + + +def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool: + """Helper to make this more idiomatic""" + just_offsets = [block.offset for block in blocks] + return just_offsets == sorted(just_offsets) + + +# Tests are below # + + +def test_sanity(): + """Read a very basic file""" + with sample_file('basic_file.cpp') as f: + blocks = find_code_blocks(f) + + assert len(blocks) == 3 + assert code_blocks_are_sorted(blocks) is True + # n.b. The parser returns line numbers as 0-based + assert blocks[0].start_line == 5 + assert blocks[0].end_line == 9 + + +def test_oneline(): + """(Assuming clang-format permits this) This sample has a function + on a single line. This will test the end-of-function detection""" + with sample_file('oneline_function.cpp') as f: + blocks = find_code_blocks(f) + + assert len(blocks) == 2 + assert blocks[0].start_line == 3 + # TODO: Because of the way it works now, this captures the blank line + # as part of the function. That's not *incorrect* per se, but + # this needs to be more consistent if we want the tool to sort the + # code blocks in the file. + assert blocks[0].end_line == 5 + + +def test_missing_offset(): + """What if the function doesn't have an offset comment?""" + with sample_file('missing_offset.cpp') as f: + blocks = find_code_blocks(f) + + # TODO: For now, the function without the offset will just be ignored. + # Would be the same outcome if the comment was present but mangled and + # we failed to match it. We should detect these cases in the future. + assert len(blocks) == 1 + + +def test_jumbled_case(): + """The parser just reports what it sees. It is the responsibility of + the downstream tools to do something about a jumbled file. + Just verify that we are reading it correctly.""" + with sample_file('out_of_order.cpp') as f: + blocks = find_code_blocks(f) + + assert len(blocks) == 3 + assert code_blocks_are_sorted(blocks) is False diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py new file mode 100644 index 00000000..33a98420 --- /dev/null +++ b/tools/isledecomp/tests/test_parser_util.py @@ -0,0 +1,87 @@ +import pytest +from isledecomp.parser.util import ( + is_blank_or_comment, + match_offset_comment, + is_exact_offset_comment, +) + + +blank_or_comment_param = [ + (True, ''), + (True, '\t'), + (True, ' '), + (False, '\tint abc=123;'), + (True, '// OFFSET: LEGO1 0xdeadbeef'), + (True, ' /* Block comment beginning'), + (True, 'Block comment ending */ '), + + # TODO: does clang-format have anything to say about these cases? + (False, 'x++; // Comment folows'), + (False, 'x++; /* Block comment begins'), +] + + +@pytest.mark.parametrize('expected, line', blank_or_comment_param) +def test_is_blank_or_comment(line: str, expected: bool): + assert is_blank_or_comment(line) is expected + + +offset_comment_samples = [ + # (can_parse: bool, exact_match: bool, line: str) + # Should match both expected modules with optional STUB marker + (True, True, '// OFFSET: LEGO1 0xdeadbeef'), + (True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'), + (True, True, '// OFFSET: ISLE 0x12345678'), + (True, True, '// OFFSET: ISLE 0x12345678 STUB'), + + # No trailing spaces allowed + (True, False, '// OFFSET: LEGO1 0xdeadbeef '), + (True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '), + + # Must have exactly one space between elements + (True, False, '//OFFSET: ISLE 0xdeadbeef'), + (True, False, '// OFFSET:ISLE 0xdeadbeef'), + (True, False, '// OFFSET: ISLE 0xdeadbeef'), + (True, False, '// OFFSET: ISLE 0xdeadbeef'), + (True, False, '// OFFSET: ISLE 0xdeadbeef'), + (True, False, '// OFFSET: ISLE 0xdeadbeef STUB'), + + # Must have 0x prefix for hex number + (True, False, '// OFFSET: ISLE deadbeef'), + + # Offset, module name, and STUB must be uppercase + (True, False, '// offset: ISLE 0xdeadbeef'), + (True, False, '// offset: isle 0xdeadbeef'), + (True, False, '// OFFSET: LEGO1 0xdeadbeef stub'), + + # Hex string must be lowercase + (True, False, '// OFFSET: ISLE 0xDEADBEEF'), + + # TODO: How flexible should we be with matching the module name? + (True, True, '// OFFSET: OMNI 0x12345678'), + (True, True, '// OFFSET: LEG01 0x12345678'), + (True, False, '// OFFSET: hello 0x12345678'), + + # Not close enough to match + (False, False, '// OFFSET: ISLE0x12345678'), + (False, False, '// OFFSET: 0x12345678'), + (False, False, '// LEGO1: 0x12345678'), + + # Hex string shorter than 8 characters + (True, True, '// OFFSET: LEGO1 0x1234'), + + # TODO: These match but shouldn't. + # (False, False, '// OFFSET: LEGO1 0'), + # (False, False, '// OFFSET: LEGO1 0x'), +] + + +@pytest.mark.parametrize('match, exact, line', offset_comment_samples) +def test_offset_match(line: str, match: bool, exact): + did_match = match_offset_comment(line) is not None + assert did_match is match + + +@pytest.mark.parametrize('match, exact, line', offset_comment_samples) +def test_exact_offset_comment(line: str, exact: bool, match): + assert is_exact_offset_comment(line) is exact