mirror of
https://github.com/isledecomp/isle.git
synced 2026-01-23 16:21:15 +00:00
Refactor checkorder into reusable isledecomp module
This commit is contained in:
parent
0c7cee0cd3
commit
a1c6cb5dc9
1
.gitignore
vendored
1
.gitignore
vendored
@ -6,3 +6,4 @@ ISLE.EXE
|
||||
LEGO1.DLL
|
||||
build/
|
||||
*.swp
|
||||
*.pyc
|
||||
@ -1,129 +1,13 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
from typing import List, Iterator, TextIO
|
||||
from collections import namedtuple
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class ReaderState(Enum):
|
||||
WANT_OFFSET = 0
|
||||
WANT_SIG = 1
|
||||
IN_FUNC = 2
|
||||
|
||||
|
||||
CodeBlock = namedtuple('CodeBlock',
|
||||
['offset', 'signature', 'start_line', 'end_line'])
|
||||
|
||||
# To match a reasonable variance of formatting for the offset comment
|
||||
offsetCommentRegex = re.compile(r'//\s?OFFSET:\s?\w+ (?:0x)?([a-f0-9]+)',
|
||||
flags=re.I)
|
||||
|
||||
# To match the exact syntax (text upper case, hex lower case, with spaces)
|
||||
# that is used in most places
|
||||
offsetCommentExactRegex = re.compile(r'// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)')
|
||||
|
||||
|
||||
def is_blank_or_comment(line: str) -> bool:
|
||||
"""Helper to read ahead adter the offset comment is matched.
|
||||
There could be blank lines or other comments before the
|
||||
function signature, and we want to skip those."""
|
||||
line_strip = line.strip()
|
||||
return (len(line_strip) == 0
|
||||
or line_strip.startswith('//')
|
||||
or line_strip.startswith('/*')
|
||||
or line_strip.endswith('*/'))
|
||||
|
||||
|
||||
def is_exact_offset_comment(line: str) -> bool:
|
||||
"""If the offset comment does not match our (unofficial) syntax
|
||||
we may want to alert the user to fix it for style points."""
|
||||
return offsetCommentExactRegex.match(line) is not None
|
||||
|
||||
|
||||
def match_offset_comment(line: str) -> str | None:
|
||||
# TODO: intended to skip the expensive regex match, but is it necessary?
|
||||
if not line.startswith('//'):
|
||||
return None
|
||||
|
||||
match = offsetCommentRegex.match(line)
|
||||
return match.group(1) if match is not None else None
|
||||
|
||||
|
||||
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
|
||||
"""Read the IO stream (file) line-by-line and give the following report:
|
||||
Foreach code block (function) in the file, what are its starting and
|
||||
ending line numbers, and what is the given offset in the original
|
||||
binary. We expect the result to be ordered by line number because we
|
||||
are reading the file from start to finish."""
|
||||
|
||||
blocks = []
|
||||
|
||||
offset = None
|
||||
function_sig = None
|
||||
start_line = None
|
||||
state = ReaderState.WANT_OFFSET
|
||||
|
||||
for line_no, line in enumerate(stream):
|
||||
if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC):
|
||||
# Naive but reasonable assumption that functions will end with
|
||||
# a curly brace on its own line with no prepended spaces.
|
||||
if line.startswith('}'):
|
||||
# TODO: could streamline this and the next case
|
||||
block = CodeBlock(offset=offset,
|
||||
signature=function_sig,
|
||||
start_line=start_line,
|
||||
end_line=line_no)
|
||||
|
||||
blocks.append(block)
|
||||
state = ReaderState.WANT_OFFSET
|
||||
elif match_offset_comment(line) is not None:
|
||||
# We hit another offset unexpectedly before detecting the
|
||||
# end of the function. We can recover easily by just
|
||||
# ending the function here.
|
||||
block = CodeBlock(offset=offset,
|
||||
signature=function_sig,
|
||||
start_line=start_line,
|
||||
end_line=line_no - 1)
|
||||
|
||||
blocks.append(block)
|
||||
state = ReaderState.WANT_OFFSET
|
||||
|
||||
# We want to grab the function signature so we can identify
|
||||
# the code block. Skip any blank lines or comments
|
||||
# that follow the offset comment.
|
||||
elif (not is_blank_or_comment(line)
|
||||
and state == ReaderState.WANT_SIG):
|
||||
function_sig = line.strip()
|
||||
state = ReaderState.IN_FUNC
|
||||
|
||||
if state == ReaderState.WANT_OFFSET:
|
||||
match = match_offset_comment(line)
|
||||
if match is not None:
|
||||
offset = int(match, 16)
|
||||
start_line = line_no
|
||||
state = ReaderState.WANT_SIG
|
||||
|
||||
return blocks
|
||||
|
||||
|
||||
def file_is_cpp(filename: str) -> bool:
|
||||
# TODO: expand to check header files also?
|
||||
(basefile, ext) = os.path.splitext(filename)
|
||||
return ext.lower() == '.cpp'
|
||||
|
||||
|
||||
def walk_source_dir(source: str) -> Iterator[tuple]:
|
||||
"""Generator to walk the given directory recursively and return
|
||||
any .cpp files found."""
|
||||
|
||||
for subdir, dirs, files in os.walk(source):
|
||||
for file in files:
|
||||
if not file_is_cpp(file):
|
||||
continue
|
||||
|
||||
yield os.path.join(subdir, file)
|
||||
from typing import TextIO
|
||||
from isledecomp.dir import walk_source_dir
|
||||
from isledecomp.parser import find_code_blocks
|
||||
from isledecomp.parser.util import (
|
||||
match_offset_comment,
|
||||
is_exact_offset_comment
|
||||
)
|
||||
|
||||
|
||||
def sig_truncate(sig: str) -> str:
|
||||
|
||||
1
tools/checkorder/requirements.txt
Normal file
1
tools/checkorder/requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
-e ../isledecomp/
|
||||
1
tools/isledecomp/.gitignore
vendored
Normal file
1
tools/isledecomp/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
isledecomp.egg-info/
|
||||
0
tools/isledecomp/isledecomp/__init__.py
Normal file
0
tools/isledecomp/isledecomp/__init__.py
Normal file
17
tools/isledecomp/isledecomp/dir.py
Normal file
17
tools/isledecomp/isledecomp/dir.py
Normal file
@ -0,0 +1,17 @@
|
||||
import os
|
||||
from typing import Iterator
|
||||
|
||||
|
||||
def file_is_cpp(filename: str) -> bool:
|
||||
(basefile, ext) = os.path.splitext(filename)
|
||||
return ext.lower() in ('.h', '.cpp')
|
||||
|
||||
|
||||
def walk_source_dir(source: str) -> Iterator[str]:
|
||||
"""Generator to walk the given directory recursively and return
|
||||
any C++ files found."""
|
||||
|
||||
for subdir, dirs, files in os.walk(source):
|
||||
for file in files:
|
||||
if file_is_cpp(file):
|
||||
yield os.path.join(subdir, file)
|
||||
1
tools/isledecomp/isledecomp/parser/__init__.py
Normal file
1
tools/isledecomp/isledecomp/parser/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from .parser import find_code_blocks
|
||||
73
tools/isledecomp/isledecomp/parser/parser.py
Normal file
73
tools/isledecomp/isledecomp/parser/parser.py
Normal file
@ -0,0 +1,73 @@
|
||||
# C++ file parser
|
||||
|
||||
from typing import List, TextIO
|
||||
from enum import Enum
|
||||
from .util import (
|
||||
CodeBlock,
|
||||
is_blank_or_comment,
|
||||
match_offset_comment,
|
||||
is_exact_offset_comment,
|
||||
)
|
||||
|
||||
|
||||
class ReaderState(Enum):
|
||||
WANT_OFFSET = 0
|
||||
WANT_SIG = 1
|
||||
IN_FUNC = 2
|
||||
|
||||
|
||||
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
|
||||
"""Read the IO stream (file) line-by-line and give the following report:
|
||||
Foreach code block (function) in the file, what are its starting and
|
||||
ending line numbers, and what is the given offset in the original
|
||||
binary. We expect the result to be ordered by line number because we
|
||||
are reading the file from start to finish."""
|
||||
|
||||
blocks = []
|
||||
|
||||
offset = None
|
||||
function_sig = None
|
||||
start_line = None
|
||||
state = ReaderState.WANT_OFFSET
|
||||
|
||||
for line_no, line in enumerate(stream):
|
||||
if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC):
|
||||
# Naive but reasonable assumption that functions will end with
|
||||
# a curly brace on its own line with no prepended spaces.
|
||||
if line.startswith('}'):
|
||||
# TODO: could streamline this and the next case
|
||||
block = CodeBlock(offset=offset,
|
||||
signature=function_sig,
|
||||
start_line=start_line,
|
||||
end_line=line_no)
|
||||
|
||||
blocks.append(block)
|
||||
state = ReaderState.WANT_OFFSET
|
||||
elif match_offset_comment(line) is not None:
|
||||
# We hit another offset unexpectedly before detecting the
|
||||
# end of the function. We can recover easily by just
|
||||
# ending the function here.
|
||||
block = CodeBlock(offset=offset,
|
||||
signature=function_sig,
|
||||
start_line=start_line,
|
||||
end_line=line_no - 1)
|
||||
|
||||
blocks.append(block)
|
||||
state = ReaderState.WANT_OFFSET
|
||||
|
||||
# We want to grab the function signature so we can identify
|
||||
# the code block. Skip any blank lines or comments
|
||||
# that follow the offset comment.
|
||||
elif (not is_blank_or_comment(line)
|
||||
and state == ReaderState.WANT_SIG):
|
||||
function_sig = line.strip()
|
||||
state = ReaderState.IN_FUNC
|
||||
|
||||
if state == ReaderState.WANT_OFFSET:
|
||||
match = match_offset_comment(line)
|
||||
if match is not None:
|
||||
offset = int(match, 16)
|
||||
start_line = line_no
|
||||
state = ReaderState.WANT_SIG
|
||||
|
||||
return blocks
|
||||
47
tools/isledecomp/isledecomp/parser/util.py
Normal file
47
tools/isledecomp/isledecomp/parser/util.py
Normal file
@ -0,0 +1,47 @@
|
||||
# C++ Parser utility functions and data structures
|
||||
import re
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
CodeBlock = namedtuple('CodeBlock',
|
||||
['offset', 'signature', 'start_line', 'end_line'])
|
||||
|
||||
|
||||
FunctionOffset = namedtuple('FunctionOffset',
|
||||
['raw', 'address', 'is_stub'])
|
||||
|
||||
|
||||
# To match a reasonable variance of formatting for the offset comment
|
||||
offsetCommentRegex = re.compile(r'//\s*OFFSET:\s*\w+\s+(?:0x)?([a-f0-9]+)',
|
||||
flags=re.I)
|
||||
|
||||
# To match the exact syntax (text upper case, hex lower case, with spaces)
|
||||
# that is used in most places
|
||||
offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)(?: STUB)?$')
|
||||
|
||||
|
||||
def is_blank_or_comment(line: str) -> bool:
|
||||
"""Helper to read ahead after the offset comment is matched.
|
||||
There could be blank lines or other comments before the
|
||||
function signature, and we want to skip those."""
|
||||
line_strip = line.strip()
|
||||
return (len(line_strip) == 0
|
||||
or line_strip.startswith('//')
|
||||
or line_strip.startswith('/*')
|
||||
or line_strip.endswith('*/'))
|
||||
|
||||
|
||||
def is_exact_offset_comment(line: str) -> bool:
|
||||
"""If the offset comment does not match our (unofficial) syntax
|
||||
we may want to alert the user to fix it for style points."""
|
||||
return offsetCommentExactRegex.match(line) is not None
|
||||
|
||||
|
||||
def match_offset_comment(line: str) -> str | None:
|
||||
# TODO: intended to skip the expensive regex match, but is it necessary?
|
||||
# TODO: this will skip indented offsets
|
||||
if not line.startswith('//'):
|
||||
return None
|
||||
|
||||
match = offsetCommentRegex.match(line)
|
||||
return match.group(1) if match is not None else None
|
||||
9
tools/isledecomp/setup.py
Normal file
9
tools/isledecomp/setup.py
Normal file
@ -0,0 +1,9 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name='isledecomp',
|
||||
version='0.1.0',
|
||||
description='Python tools for the isledecomp project',
|
||||
packages=find_packages(),
|
||||
tests_require=['pytest'],
|
||||
)
|
||||
0
tools/isledecomp/tests/__init__.py
Normal file
0
tools/isledecomp/tests/__init__.py
Normal file
22
tools/isledecomp/tests/samples/basic_file.cpp
Normal file
22
tools/isledecomp/tests/samples/basic_file.cpp
Normal file
@ -0,0 +1,22 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// A very simple well-formed code file
|
||||
|
||||
// OFFSET: LEGO1 0x1234
|
||||
void function01()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// OFFSET: LEGO1 0x2345
|
||||
void function02()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// OFFSET: LEGO1 0x3456
|
||||
void function03()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
16
tools/isledecomp/tests/samples/missing_offset.cpp
Normal file
16
tools/isledecomp/tests/samples/missing_offset.cpp
Normal file
@ -0,0 +1,16 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
int no_offset_comment()
|
||||
{
|
||||
static int dummy = 123;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// OFFSET: LEGO1 0xdeadbeef
|
||||
void regular_ole_function()
|
||||
{
|
||||
printf("hi there");
|
||||
}
|
||||
12
tools/isledecomp/tests/samples/oneline_function.cpp
Normal file
12
tools/isledecomp/tests/samples/oneline_function.cpp
Normal file
@ -0,0 +1,12 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// OFFSET: LEGO1 0x1234
|
||||
void short_function() { static char* msg = "oneliner"; }
|
||||
|
||||
// OFFSET: LEGO1 0x5555
|
||||
void function_after_one_liner()
|
||||
{
|
||||
// This function comes after the previous that is on a single line.
|
||||
// Do we report the offset for this one correctly?
|
||||
}
|
||||
20
tools/isledecomp/tests/samples/out_of_order.cpp
Normal file
20
tools/isledecomp/tests/samples/out_of_order.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// OFFSET: LEGO1 0x1001
|
||||
void function_order01()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// OFFSET: LEGO1 0x1003
|
||||
void function_order03()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
|
||||
// OFFSET: LEGO1 0x1002
|
||||
void function_order02()
|
||||
{
|
||||
// TODO
|
||||
}
|
||||
72
tools/isledecomp/tests/test_parser.py
Normal file
72
tools/isledecomp/tests/test_parser.py
Normal file
@ -0,0 +1,72 @@
|
||||
import os
|
||||
import pytest
|
||||
from typing import List, TextIO
|
||||
from isledecomp.parser import find_code_blocks
|
||||
from isledecomp.parser.util import CodeBlock
|
||||
|
||||
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples')
|
||||
|
||||
|
||||
def sample_file(filename: str) -> TextIO:
|
||||
"""Wrapper for opening the samples from the directory that does not
|
||||
depend on the cwd where we run the test"""
|
||||
full_path = os.path.join(SAMPLE_DIR, filename)
|
||||
return open(full_path, 'r')
|
||||
|
||||
|
||||
def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
|
||||
"""Helper to make this more idiomatic"""
|
||||
just_offsets = [block.offset for block in blocks]
|
||||
return just_offsets == sorted(just_offsets)
|
||||
|
||||
|
||||
# Tests are below #
|
||||
|
||||
|
||||
def test_sanity():
|
||||
"""Read a very basic file"""
|
||||
with sample_file('basic_file.cpp') as f:
|
||||
blocks = find_code_blocks(f)
|
||||
|
||||
assert len(blocks) == 3
|
||||
assert code_blocks_are_sorted(blocks) is True
|
||||
# n.b. The parser returns line numbers as 0-based
|
||||
assert blocks[0].start_line == 5
|
||||
assert blocks[0].end_line == 9
|
||||
|
||||
|
||||
def test_oneline():
|
||||
"""(Assuming clang-format permits this) This sample has a function
|
||||
on a single line. This will test the end-of-function detection"""
|
||||
with sample_file('oneline_function.cpp') as f:
|
||||
blocks = find_code_blocks(f)
|
||||
|
||||
assert len(blocks) == 2
|
||||
assert blocks[0].start_line == 3
|
||||
# TODO: Because of the way it works now, this captures the blank line
|
||||
# as part of the function. That's not *incorrect* per se, but
|
||||
# this needs to be more consistent if we want the tool to sort the
|
||||
# code blocks in the file.
|
||||
assert blocks[0].end_line == 5
|
||||
|
||||
|
||||
def test_missing_offset():
|
||||
"""What if the function doesn't have an offset comment?"""
|
||||
with sample_file('missing_offset.cpp') as f:
|
||||
blocks = find_code_blocks(f)
|
||||
|
||||
# TODO: For now, the function without the offset will just be ignored.
|
||||
# Would be the same outcome if the comment was present but mangled and
|
||||
# we failed to match it. We should detect these cases in the future.
|
||||
assert len(blocks) == 1
|
||||
|
||||
|
||||
def test_jumbled_case():
|
||||
"""The parser just reports what it sees. It is the responsibility of
|
||||
the downstream tools to do something about a jumbled file.
|
||||
Just verify that we are reading it correctly."""
|
||||
with sample_file('out_of_order.cpp') as f:
|
||||
blocks = find_code_blocks(f)
|
||||
|
||||
assert len(blocks) == 3
|
||||
assert code_blocks_are_sorted(blocks) is False
|
||||
87
tools/isledecomp/tests/test_parser_util.py
Normal file
87
tools/isledecomp/tests/test_parser_util.py
Normal file
@ -0,0 +1,87 @@
|
||||
import pytest
|
||||
from isledecomp.parser.util import (
|
||||
is_blank_or_comment,
|
||||
match_offset_comment,
|
||||
is_exact_offset_comment,
|
||||
)
|
||||
|
||||
|
||||
blank_or_comment_param = [
|
||||
(True, ''),
|
||||
(True, '\t'),
|
||||
(True, ' '),
|
||||
(False, '\tint abc=123;'),
|
||||
(True, '// OFFSET: LEGO1 0xdeadbeef'),
|
||||
(True, ' /* Block comment beginning'),
|
||||
(True, 'Block comment ending */ '),
|
||||
|
||||
# TODO: does clang-format have anything to say about these cases?
|
||||
(False, 'x++; // Comment folows'),
|
||||
(False, 'x++; /* Block comment begins'),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('expected, line', blank_or_comment_param)
|
||||
def test_is_blank_or_comment(line: str, expected: bool):
|
||||
assert is_blank_or_comment(line) is expected
|
||||
|
||||
|
||||
offset_comment_samples = [
|
||||
# (can_parse: bool, exact_match: bool, line: str)
|
||||
# Should match both expected modules with optional STUB marker
|
||||
(True, True, '// OFFSET: LEGO1 0xdeadbeef'),
|
||||
(True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'),
|
||||
(True, True, '// OFFSET: ISLE 0x12345678'),
|
||||
(True, True, '// OFFSET: ISLE 0x12345678 STUB'),
|
||||
|
||||
# No trailing spaces allowed
|
||||
(True, False, '// OFFSET: LEGO1 0xdeadbeef '),
|
||||
(True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '),
|
||||
|
||||
# Must have exactly one space between elements
|
||||
(True, False, '//OFFSET: ISLE 0xdeadbeef'),
|
||||
(True, False, '// OFFSET:ISLE 0xdeadbeef'),
|
||||
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
|
||||
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
|
||||
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
|
||||
(True, False, '// OFFSET: ISLE 0xdeadbeef STUB'),
|
||||
|
||||
# Must have 0x prefix for hex number
|
||||
(True, False, '// OFFSET: ISLE deadbeef'),
|
||||
|
||||
# Offset, module name, and STUB must be uppercase
|
||||
(True, False, '// offset: ISLE 0xdeadbeef'),
|
||||
(True, False, '// offset: isle 0xdeadbeef'),
|
||||
(True, False, '// OFFSET: LEGO1 0xdeadbeef stub'),
|
||||
|
||||
# Hex string must be lowercase
|
||||
(True, False, '// OFFSET: ISLE 0xDEADBEEF'),
|
||||
|
||||
# TODO: How flexible should we be with matching the module name?
|
||||
(True, True, '// OFFSET: OMNI 0x12345678'),
|
||||
(True, True, '// OFFSET: LEG01 0x12345678'),
|
||||
(True, False, '// OFFSET: hello 0x12345678'),
|
||||
|
||||
# Not close enough to match
|
||||
(False, False, '// OFFSET: ISLE0x12345678'),
|
||||
(False, False, '// OFFSET: 0x12345678'),
|
||||
(False, False, '// LEGO1: 0x12345678'),
|
||||
|
||||
# Hex string shorter than 8 characters
|
||||
(True, True, '// OFFSET: LEGO1 0x1234'),
|
||||
|
||||
# TODO: These match but shouldn't.
|
||||
# (False, False, '// OFFSET: LEGO1 0'),
|
||||
# (False, False, '// OFFSET: LEGO1 0x'),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
|
||||
def test_offset_match(line: str, match: bool, exact):
|
||||
did_match = match_offset_comment(line) is not None
|
||||
assert did_match is match
|
||||
|
||||
|
||||
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
|
||||
def test_exact_offset_comment(line: str, exact: bool, match):
|
||||
assert is_exact_offset_comment(line) is exact
|
||||
Loading…
Reference in New Issue
Block a user