Refactor checkorder into reusable isledecomp module

This commit is contained in:
disinvite 2023-10-29 18:15:37 -04:00
parent 0c7cee0cd3
commit a1c6cb5dc9
17 changed files with 386 additions and 123 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@ ISLE.EXE
LEGO1.DLL
build/
*.swp
*.pyc

View File

@ -1,129 +1,13 @@
import os
import re
import sys
import argparse
from typing import List, Iterator, TextIO
from collections import namedtuple
from enum import Enum
class ReaderState(Enum):
    """States of the line-by-line reader used by find_code_blocks."""

    # Scanning for the next offset comment
    WANT_OFFSET = 0
    # Offset comment seen; looking for the function signature line
    WANT_SIG = 1
    # Signature captured; looking for the end of the function
    IN_FUNC = 2
# One annotated function: its offset, signature text and line span.
CodeBlock = namedtuple(
    'CodeBlock',
    'offset signature start_line end_line',
)

# Lenient pattern: tolerates some variation in spacing, letter case and
# an absent "0x" prefix. Captures the hex digits only.
offsetCommentRegex = re.compile(
    r'//\s?OFFSET:\s?\w+ (?:0x)?([a-f0-9]+)',
    re.IGNORECASE,
)

# Strict pattern: the exact spelling used in most places (upper-case text,
# lower-case hex, single spaces). Captures the hex value with its prefix.
offsetCommentExactRegex = re.compile(
    r'// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)'
)
def is_blank_or_comment(line: str) -> bool:
    """Tell whether a line is empty or looks like a comment.

    Used to read ahead after an offset comment has been matched: blank
    lines and other comments may sit between the offset comment and the
    function signature, and those should be skipped."""
    stripped = line.strip()
    if not stripped:
        return True
    # Line comment, start of a block comment, or end of a block comment
    return stripped.startswith(('//', '/*')) or stripped.endswith('*/')
def is_exact_offset_comment(line: str) -> bool:
    """Check the line against the strict (unofficial) offset comment syntax.

    A False result for a line that is otherwise an offset comment can be
    used to tell the user to tidy it up for style points."""
    strict_match = offsetCommentExactRegex.match(line)
    return strict_match is not None
def match_offset_comment(line: str) -> str | None:
    """Return the hex digits from an offset comment, or None.

    Only lines that begin with '//' are tested against the lenient offset
    pattern; the returned text is the pattern's first capture group."""
    # TODO: intended to skip the expensive regex match, but is it necessary?
    found = offsetCommentRegex.match(line) if line.startswith('//') else None
    if found is None:
        return None
    return found.group(1)
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
    """Scan a C++ source stream and report its annotated code blocks.

    The stream is read line by line. Each offset comment opens a block;
    the block is closed by a line starting with '}' or by the next offset
    comment. Every CodeBlock carries the offset parsed from the comment
    (as an int), the signature line that followed it, and the zero-based
    start and end line numbers. Because the file is read from top to
    bottom the result is ordered by line number."""
    found = []
    cur_offset = None
    cur_sig = None
    first_line = None
    state = ReaderState.WANT_OFFSET

    for index, text in enumerate(stream):
        if state is not ReaderState.WANT_OFFSET:
            # Work out whether the open block ends on (or just before)
            # this line.
            last_line = None
            if text.startswith('}'):
                # Naive but reasonable assumption: a function ends with a
                # curly brace at the very start of a line.
                last_line = index
            elif match_offset_comment(text) is not None:
                # Another offset turned up before the end of the function
                # was detected. Recover by ending the block on the
                # previous line.
                last_line = index - 1

            if last_line is not None:
                found.append(CodeBlock(offset=cur_offset,
                                       signature=cur_sig,
                                       start_line=first_line,
                                       end_line=last_line))
                state = ReaderState.WANT_OFFSET
            elif (state is ReaderState.WANT_SIG
                    and not is_blank_or_comment(text)):
                # First real line after the offset comment identifies the
                # block; blank lines and comments in between are skipped.
                cur_sig = text.strip()
                state = ReaderState.IN_FUNC

        # Deliberately not an elif: a line that just closed a block may
        # itself be the offset comment that opens the next one.
        if state is ReaderState.WANT_OFFSET:
            hex_text = match_offset_comment(text)
            if hex_text is not None:
                cur_offset = int(hex_text, 16)
                first_line = index
                state = ReaderState.WANT_SIG

    return found
def file_is_cpp(filename: str) -> bool:
    """Return True when the filename has a .cpp extension (any letter case)."""
    # TODO: expand to check header files also?
    extension = os.path.splitext(filename)[1]
    return extension.lower() == '.cpp'
def walk_source_dir(source: str) -> Iterator[str]:
    """Generator to walk the given directory recursively and yield the
    path of every .cpp file found.

    Each yielded value is a single path string: the directory reported by
    os.walk joined with the file name. The return annotation is therefore
    Iterator[str], not Iterator[tuple]."""
    # The list of sub-directories from os.walk is not needed here.
    for subdir, _dirs, files in os.walk(source):
        for file in files:
            if file_is_cpp(file):
                yield os.path.join(subdir, file)
from typing import TextIO
from isledecomp.dir import walk_source_dir
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import (
match_offset_comment,
is_exact_offset_comment
)
def sig_truncate(sig: str) -> str:

View File

@ -0,0 +1 @@
-e ../isledecomp/

1
tools/isledecomp/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
isledecomp.egg-info/

View File

View File

@ -0,0 +1,17 @@
import os
from typing import Iterator
def file_is_cpp(filename: str) -> bool:
    """Return True for C++ sources and headers (.cpp or .h, any case)."""
    extension = os.path.splitext(filename)[1].lower()
    return extension in ('.h', '.cpp')
def walk_source_dir(source: str) -> Iterator[str]:
    """Recursively walk `source` and yield the path of each C++ file.

    A file qualifies when file_is_cpp accepts its name; the yielded path
    is the containing directory joined with that name."""
    for root, _subdirs, names in os.walk(source):
        yield from (
            os.path.join(root, name) for name in names if file_is_cpp(name)
        )

View File

@ -0,0 +1 @@
from .parser import find_code_blocks

View File

@ -0,0 +1,73 @@
# C++ file parser
from typing import List, TextIO
from enum import Enum
from .util import (
CodeBlock,
is_blank_or_comment,
match_offset_comment,
is_exact_offset_comment,
)
class ReaderState(Enum):
    """States of the line-by-line reader used by find_code_blocks."""

    # Scanning for the next offset comment
    WANT_OFFSET = 0
    # Offset comment seen; looking for the function signature line
    WANT_SIG = 1
    # Signature captured; looking for the end of the function
    IN_FUNC = 2
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
    """Scan a C++ source stream and report its annotated code blocks.

    The stream is read line by line. Each offset comment opens a block;
    the block is closed by a line starting with '}' or by the next offset
    comment. Every CodeBlock carries the offset parsed from the comment
    (as an int), the signature line that followed it, and the zero-based
    start and end line numbers. Because the file is read from top to
    bottom the result is ordered by line number."""
    found = []
    cur_offset = None
    cur_sig = None
    first_line = None
    state = ReaderState.WANT_OFFSET

    for index, text in enumerate(stream):
        if state is not ReaderState.WANT_OFFSET:
            # Work out whether the open block ends on (or just before)
            # this line.
            last_line = None
            if text.startswith('}'):
                # Naive but reasonable assumption: a function ends with a
                # curly brace at the very start of a line.
                last_line = index
            elif match_offset_comment(text) is not None:
                # Another offset turned up before the end of the function
                # was detected. Recover by ending the block on the
                # previous line.
                last_line = index - 1

            if last_line is not None:
                found.append(CodeBlock(offset=cur_offset,
                                       signature=cur_sig,
                                       start_line=first_line,
                                       end_line=last_line))
                state = ReaderState.WANT_OFFSET
            elif (state is ReaderState.WANT_SIG
                    and not is_blank_or_comment(text)):
                # First real line after the offset comment identifies the
                # block; blank lines and comments in between are skipped.
                cur_sig = text.strip()
                state = ReaderState.IN_FUNC

        # Deliberately not an elif: a line that just closed a block may
        # itself be the offset comment that opens the next one.
        if state is ReaderState.WANT_OFFSET:
            hex_text = match_offset_comment(text)
            if hex_text is not None:
                cur_offset = int(hex_text, 16)
                first_line = index
                state = ReaderState.WANT_SIG

    return found

View File

@ -0,0 +1,47 @@
# C++ Parser utility functions and data structures
import re
from collections import namedtuple
# One annotated function: its offset, signature text and line span.
CodeBlock = namedtuple(
    'CodeBlock',
    'offset signature start_line end_line',
)

# Record for a parsed offset comment.
FunctionOffset = namedtuple(
    'FunctionOffset',
    'raw address is_stub',
)

# Lenient pattern: tolerates variation in spacing, letter case and an
# absent "0x" prefix. Captures the hex digits only.
offsetCommentRegex = re.compile(
    r'//\s*OFFSET:\s*\w+\s+(?:0x)?([a-f0-9]+)',
    re.IGNORECASE,
)

# Strict pattern: the exact spelling used in most places (upper-case text,
# lower-case hex, single spaces, optional trailing STUB marker) and nothing
# else on the line.
offsetCommentExactRegex = re.compile(
    r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)(?: STUB)?$'
)
def is_blank_or_comment(line: str) -> bool:
    """Tell whether a line is empty or looks like a comment.

    Used to read ahead after an offset comment has been matched: blank
    lines and other comments may sit between the offset comment and the
    function signature, and those should be skipped."""
    stripped = line.strip()
    if not stripped:
        return True
    # Line comment, start of a block comment, or end of a block comment
    return stripped.startswith(('//', '/*')) or stripped.endswith('*/')
def is_exact_offset_comment(line: str) -> bool:
    """Check the line against the strict (unofficial) offset comment syntax.

    A False result for a line that is otherwise an offset comment can be
    used to tell the user to tidy it up for style points."""
    strict_match = offsetCommentExactRegex.match(line)
    return strict_match is not None
def match_offset_comment(line: str) -> str | None:
    """Return the hex digits from an offset comment, or None.

    Only lines that begin with '//' are tested against the lenient offset
    pattern; the returned text is the pattern's first capture group."""
    # TODO: intended to skip the expensive regex match, but is it necessary?
    # TODO: this will skip indented offsets
    found = offsetCommentRegex.match(line) if line.startswith('//') else None
    if found is None:
        return None
    return found.group(1)

View File

@ -0,0 +1,9 @@
from setuptools import setup, find_packages
# Package metadata for the isledecomp helper tools.
setup(
    name='isledecomp',
    version='0.1.0',
    description='Python tools for the isledecomp project',
    packages=find_packages(),
    # The parser utilities annotate a return type as `str | None`. That
    # expression is evaluated when the function is defined and only works
    # on Python 3.10 or newer, so declare the requirement up front instead
    # of failing at import time on older interpreters.
    python_requires='>=3.10',
    tests_require=['pytest'],
)

View File

View File

@ -0,0 +1,22 @@
// Sample input for the python parser unit tests
// Not part of the decomp
// A very simple well-formed code file: three functions, offsets ascending
// OFFSET: LEGO1 0x1234
void function01()
{
    // TODO
}
// OFFSET: LEGO1 0x2345
void function02()
{
    // TODO
}
// OFFSET: LEGO1 0x3456
void function03()
{
    // TODO
}

View File

@ -0,0 +1,16 @@
// Sample input for the python parser unit tests
// Not part of the decomp; the first function has no OFFSET comment
#include <stdio.h>
int no_offset_comment()
{
    static int dummy = 123;
    return -1;
}
// OFFSET: LEGO1 0xdeadbeef
void regular_ole_function()
{
    printf("hi there");
}

View File

@ -0,0 +1,12 @@
// Sample input for the python parser unit tests
// Not part of the decomp; the first function sits on a single line
// OFFSET: LEGO1 0x1234
void short_function() { static char* msg = "oneliner"; }
// OFFSET: LEGO1 0x5555
void function_after_one_liner()
{
    // This function comes after the previous that is on a single line.
    // Do we report the offset for this one correctly?
}

View File

@ -0,0 +1,20 @@
// Sample input for the python parser unit tests
// Not part of the decomp; offsets below are not in ascending order
// OFFSET: LEGO1 0x1001
void function_order01()
{
    // TODO
}
// OFFSET: LEGO1 0x1003
void function_order03()
{
    // TODO
}
// OFFSET: LEGO1 0x1002
void function_order02()
{
    // TODO
}

View File

@ -0,0 +1,72 @@
import os
import pytest
from typing import List, TextIO
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import CodeBlock
# Directory holding the sample inputs, located relative to this test module.
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples')


def sample_file(filename: str) -> TextIO:
    """Open one of the sample files for reading.

    The path is built from SAMPLE_DIR, so it does not depend on the
    working directory the tests are run from. The caller is responsible
    for closing the returned file object."""
    return open(os.path.join(SAMPLE_DIR, filename), 'r')
def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
    """Return True when the blocks' offsets are in non-decreasing order."""
    return all(
        earlier.offset <= later.offset
        for earlier, later in zip(blocks, blocks[1:])
    )
# Tests are below #
def test_sanity():
    """Parse the simplest well-formed sample and check the basics."""
    with sample_file('basic_file.cpp') as stream:
        blocks = find_code_blocks(stream)

    assert len(blocks) == 3
    assert code_blocks_are_sorted(blocks) is True

    # n.b. The parser returns line numbers as 0-based
    first = blocks[0]
    assert first.start_line == 5
    assert first.end_line == 9
def test_oneline():
    """Exercise end-of-function detection with a one-line function.

    (Assuming clang-format permits this) the sample's first function is
    written entirely on a single line."""
    with sample_file('oneline_function.cpp') as stream:
        blocks = find_code_blocks(stream)

    assert len(blocks) == 2

    first = blocks[0]
    assert first.start_line == 3
    # TODO: Because of the way it works now, this captures the blank line
    # as part of the function. That's not *incorrect* per se, but
    # this needs to be more consistent if we want the tool to sort the
    # code blocks in the file.
    assert first.end_line == 5
def test_missing_offset():
    """A function with no offset comment is not reported as a block."""
    with sample_file('missing_offset.cpp') as stream:
        blocks = find_code_blocks(stream)

    # TODO: For now, the function without the offset will just be ignored.
    # Would be the same outcome if the comment was present but mangled and
    # we failed to match it. We should detect these cases in the future.
    assert len(blocks) == 1
def test_jumbled_case():
    """Verify that an out-of-order file is read as written.

    The parser only reports what it sees; doing something about a jumbled
    file is the job of the downstream tools."""
    with sample_file('out_of_order.cpp') as stream:
        blocks = find_code_blocks(stream)

    assert len(blocks) == 3
    assert code_blocks_are_sorted(blocks) is False

View File

@ -0,0 +1,87 @@
import pytest
from isledecomp.parser.util import (
is_blank_or_comment,
match_offset_comment,
is_exact_offset_comment,
)
# Rows of (expected result, input line) for is_blank_or_comment.
blank_or_comment_param = [
    # Empty and whitespace-only lines
    (True, ""),
    (True, "\t"),
    (True, " "),
    # Indented code is neither blank nor a comment
    (False, "\tint abc=123;"),
    # Line comments and either end of a block comment
    (True, "// OFFSET: LEGO1 0xdeadbeef"),
    (True, " /* Block comment beginning"),
    (True, "Block comment ending */ "),
    # TODO: does clang-format have anything to say about these cases?
    (False, "x++; // Comment folows"),
    (False, "x++; /* Block comment begins"),
]
@pytest.mark.parametrize('expected, line', blank_or_comment_param)
def test_is_blank_or_comment(line: str, expected: bool):
    """Each sample line is classified as the table says it should be."""
    actual = is_blank_or_comment(line)
    assert actual is expected
# Sample offset comments shared by the two tests that follow.
# Each row is (can_parse, exact_match, line):
#   can_parse   - the lenient matcher should accept the line
#   exact_match - the line also follows the strict syntax
offset_comment_samples = [
    # (can_parse: bool, exact_match: bool, line: str)
    # Should match both expected modules with optional STUB marker
    (True, True, '// OFFSET: LEGO1 0xdeadbeef'),
    (True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'),
    (True, True, '// OFFSET: ISLE 0x12345678'),
    (True, True, '// OFFSET: ISLE 0x12345678 STUB'),

    # No trailing spaces allowed
    (True, False, '// OFFSET: LEGO1 0xdeadbeef '),
    (True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '),

    # Must have exactly one space between elements.
    # The first two rows drop a space; the remaining four double one, each
    # in a different position. Written with single spaces these rows would
    # be strictly valid and contradict their exact_match=False expectation.
    (True, False, '//OFFSET: ISLE 0xdeadbeef'),
    (True, False, '// OFFSET:ISLE 0xdeadbeef'),
    (True, False, '//  OFFSET: ISLE 0xdeadbeef'),
    (True, False, '// OFFSET:  ISLE 0xdeadbeef'),
    (True, False, '// OFFSET: ISLE  0xdeadbeef'),
    (True, False, '// OFFSET: ISLE 0xdeadbeef  STUB'),

    # Must have 0x prefix for hex number
    (True, False, '// OFFSET: ISLE deadbeef'),

    # Offset, module name, and STUB must be uppercase
    (True, False, '// offset: ISLE 0xdeadbeef'),
    (True, False, '// offset: isle 0xdeadbeef'),
    (True, False, '// OFFSET: LEGO1 0xdeadbeef stub'),

    # Hex string must be lowercase
    (True, False, '// OFFSET: ISLE 0xDEADBEEF'),

    # TODO: How flexible should we be with matching the module name?
    (True, True, '// OFFSET: OMNI 0x12345678'),
    (True, True, '// OFFSET: LEG01 0x12345678'),
    (True, False, '// OFFSET: hello 0x12345678'),

    # Not close enough to match
    (False, False, '// OFFSET: ISLE0x12345678'),
    (False, False, '// OFFSET: 0x12345678'),
    (False, False, '// LEGO1: 0x12345678'),

    # Hex string shorter than 8 characters
    (True, True, '// OFFSET: LEGO1 0x1234'),

    # TODO: These match but shouldn't.
    # (False, False, '// OFFSET: LEGO1 0'),
    # (False, False, '// OFFSET: LEGO1 0x'),
]
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_offset_match(line: str, match: bool, exact):
    """The lenient matcher accepts exactly the rows flagged as parseable."""
    parsed = match_offset_comment(line)
    assert (parsed is not None) is match
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_exact_offset_comment(line: str, exact: bool, match):
    """The strict check accepts exactly the rows flagged as exact."""
    verdict = is_exact_offset_comment(line)
    assert verdict is exact