mirror of
https://github.com/isledecomp/isle.git
synced 2026-01-23 16:21:15 +00:00
Refactor of state machine to prepare for reccmp integration
This commit is contained in:
parent
cb7e223b63
commit
7b61b33c98
@ -2,7 +2,10 @@
|
||||
import sys
|
||||
import argparse
|
||||
from typing import TextIO
|
||||
from isledecomp.dir import walk_source_dir
|
||||
from isledecomp.dir import (
|
||||
walk_source_dir,
|
||||
file_is_cpp
|
||||
)
|
||||
from isledecomp.parser import find_code_blocks
|
||||
from isledecomp.parser.util import (
|
||||
is_exact_offset_comment
|
||||
|
||||
@ -7,6 +7,8 @@
|
||||
is_blank_or_comment,
|
||||
match_offset_comment,
|
||||
is_exact_offset_comment,
|
||||
template_function_name,
|
||||
remove_trailing_comment,
|
||||
)
|
||||
|
||||
|
||||
@ -14,6 +16,9 @@ class ReaderState(Enum):
|
||||
WANT_OFFSET = 0
|
||||
WANT_SIG = 1
|
||||
IN_FUNC = 2
|
||||
IN_TEMPLATE = 3
|
||||
WANT_CURLY = 4
|
||||
FUNCTION_DONE = 5
|
||||
|
||||
|
||||
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
|
||||
@ -25,53 +30,106 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
|
||||
|
||||
blocks = []
|
||||
|
||||
offset = None
|
||||
offset_match = None
|
||||
offset_comment = None
|
||||
function_sig = None
|
||||
start_line = None
|
||||
end_line = None
|
||||
state = ReaderState.WANT_OFFSET
|
||||
|
||||
for line_no, line in enumerate(stream):
|
||||
if state in (ReaderState.WANT_SIG, ReaderState.IN_FUNC):
|
||||
# 1-based to match cvdump and your text-editor
|
||||
# I know it says 0, but we will increment before each readline()
|
||||
line_no = 0
|
||||
can_seek = True
|
||||
|
||||
while True:
|
||||
# Do this before reading again so that an EOF will not
|
||||
# cause us to miss the last function of the file.
|
||||
if state == ReaderState.FUNCTION_DONE:
|
||||
block = CodeBlock(offset=offset_match.address,
|
||||
signature=function_sig,
|
||||
start_line=start_line,
|
||||
end_line=end_line,
|
||||
offset_comment=offset_comment,
|
||||
module=offset_match.module,
|
||||
is_template=offset_match.is_template,
|
||||
is_stub=offset_match.is_stub)
|
||||
blocks.append(block)
|
||||
state = ReaderState.WANT_OFFSET
|
||||
|
||||
if can_seek:
|
||||
line_no += 1
|
||||
line = stream.readline()
|
||||
if line == '':
|
||||
break
|
||||
|
||||
if (state != ReaderState.WANT_OFFSET and
|
||||
match_offset_comment(line) is not None):
|
||||
# We hit another offset unexpectedly.
|
||||
# We can recover easily by just ending the function here.
|
||||
end_line = line_no - 1
|
||||
state = ReaderState.FUNCTION_DONE
|
||||
|
||||
# Pause reading here so we handle the offset marker
|
||||
# on the next loop iteration
|
||||
can_seek = False
|
||||
|
||||
# Regular state machine handling begins now
|
||||
if state == ReaderState.IN_TEMPLATE:
|
||||
# TEMPLATE functions are a special case. The signature is
|
||||
# given on the next line (in a // comment)
|
||||
function_sig = template_function_name(line)
|
||||
start_line = line_no
|
||||
end_line = line_no
|
||||
state = ReaderState.FUNCTION_DONE
|
||||
|
||||
elif state == ReaderState.WANT_SIG:
|
||||
# Skip blank lines or comments that come after the offset
|
||||
# marker. There is not a formal procedure for this, so just
|
||||
# assume the next "code line" is the function signature
|
||||
if not is_blank_or_comment(line):
|
||||
function_sig = remove_trailing_comment(line.strip())
|
||||
|
||||
# Now check to see if the opening curly bracket is on the
|
||||
# same line. clang-format should prevent this (BraceWrapping)
|
||||
# but it is easy to detect.
|
||||
# If the entire function is on one line, we can handle that
|
||||
# too, although this should be limited to inlines.
|
||||
if function_sig.endswith('{'):
|
||||
start_line = line_no
|
||||
state = ReaderState.IN_FUNC
|
||||
elif (function_sig.endswith('}') or
|
||||
function_sig.endswith('};')):
|
||||
start_line = line_no
|
||||
end_line = line_no
|
||||
state = ReaderState.FUNCTION_DONE
|
||||
else:
|
||||
state = ReaderState.WANT_CURLY
|
||||
|
||||
elif state == ReaderState.WANT_CURLY:
|
||||
if line.strip() == '{':
|
||||
start_line = line_no
|
||||
state = ReaderState.IN_FUNC
|
||||
|
||||
elif state == ReaderState.IN_FUNC:
|
||||
# Naive but reasonable assumption that functions will end with
|
||||
# a curly brace on its own line with no prepended spaces.
|
||||
if line.startswith('}'):
|
||||
# TODO: could streamline this and the next case
|
||||
block = CodeBlock(offset=offset,
|
||||
signature=function_sig,
|
||||
start_line=start_line,
|
||||
end_line=line_no,
|
||||
offset_comment=offset_comment)
|
||||
end_line = line_no
|
||||
state = ReaderState.FUNCTION_DONE
|
||||
|
||||
blocks.append(block)
|
||||
state = ReaderState.WANT_OFFSET
|
||||
elif match_offset_comment(line) is not None:
|
||||
# We hit another offset unexpectedly before detecting the
|
||||
# end of the function. We can recover easily by just
|
||||
# ending the function here.
|
||||
block = CodeBlock(offset=offset,
|
||||
signature=function_sig,
|
||||
start_line=start_line,
|
||||
end_line=line_no - 1,
|
||||
offset_comment=offset_comment)
|
||||
|
||||
blocks.append(block)
|
||||
state = ReaderState.WANT_OFFSET
|
||||
|
||||
# We want to grab the function signature so we can identify
|
||||
# the code block. Skip any blank lines or comments
|
||||
# that follow the offset comment.
|
||||
elif (not is_blank_or_comment(line)
|
||||
and state == ReaderState.WANT_SIG):
|
||||
function_sig = line.strip()
|
||||
state = ReaderState.IN_FUNC
|
||||
|
||||
if state == ReaderState.WANT_OFFSET:
|
||||
elif state == ReaderState.WANT_OFFSET:
|
||||
# If we detected an offset marker unexpectedly, we are handling
|
||||
# it here so we can continue seeking.
|
||||
can_seek = True
|
||||
match = match_offset_comment(line)
|
||||
if match is not None:
|
||||
offset = int(match, 16)
|
||||
offset_match = match
|
||||
offset_comment = line.strip()
|
||||
start_line = line_no
|
||||
state = ReaderState.WANT_SIG
|
||||
|
||||
if match.is_template:
|
||||
state = ReaderState.IN_TEMPLATE
|
||||
else:
|
||||
state = ReaderState.WANT_SIG
|
||||
|
||||
return blocks
|
||||
|
||||
@ -5,17 +5,47 @@
|
||||
|
||||
CodeBlock = namedtuple('CodeBlock',
|
||||
['offset', 'signature', 'start_line', 'end_line',
|
||||
'offset_comment'])
|
||||
'offset_comment', 'module', 'is_template', 'is_stub'])
|
||||
|
||||
OffsetMatch = namedtuple('OffsetMatch', ['module', 'address',
|
||||
'is_template', 'is_stub'])
|
||||
|
||||
# This has not been formally established, but considering that "STUB"
|
||||
# is a temporary state for a function, we assume it will appear last,
|
||||
# after any other modifiers (i.e. TEMPLATE)
|
||||
|
||||
# To match a reasonable variance of formatting for the offset comment
|
||||
offsetCommentRegex = re.compile(r'//\s*OFFSET:\s*\w+\s+(?:0x)?([a-f0-9]+)(\s+STUB)?',
|
||||
offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?', # nopep8
|
||||
flags=re.I)
|
||||
|
||||
# To match the exact syntax (text upper case, hex lower case, with spaces)
|
||||
# that is used in most places
|
||||
# TODO: template and stub mutually exclusive?
|
||||
offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)(?: STUB| TEMPLATE)?$')
|
||||
offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$') # nopep8
|
||||
|
||||
|
||||
# The goal here is to just read whatever is on the next line, so some
|
||||
# flexibility in the formatting seems OK
|
||||
templateCommentRegex = re.compile(r'\s*//\s+(.*)')
|
||||
|
||||
|
||||
# To remove any comment (//) or block comment (/*) and its leading spaces
|
||||
# from the end of a code line
|
||||
trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')
|
||||
|
||||
|
||||
def template_function_name(line: str) -> str:
|
||||
"""Parse function signature for special TEMPLATE functions"""
|
||||
template_match = templateCommentRegex.match(line)
|
||||
|
||||
# If we don't match, you get whatever is on the line as the signature
|
||||
if template_match is not None:
|
||||
return template_match.group(1)
|
||||
else:
|
||||
return line
|
||||
|
||||
|
||||
def remove_trailing_comment(line: str) -> str:
|
||||
return trailingCommentRegex.sub('', line)
|
||||
|
||||
|
||||
def is_blank_or_comment(line: str) -> bool:
|
||||
@ -35,11 +65,12 @@ def is_exact_offset_comment(line: str) -> bool:
|
||||
return offsetCommentExactRegex.match(line) is not None
|
||||
|
||||
|
||||
def match_offset_comment(line: str) -> str | None:
|
||||
# TODO: intended to skip the expensive regex match, but is it necessary?
|
||||
# TODO: this will skip indented offsets
|
||||
if not line.startswith('//'):
|
||||
def match_offset_comment(line: str) -> OffsetMatch | None:
|
||||
match = offsetCommentRegex.match(line)
|
||||
if match is None:
|
||||
return None
|
||||
|
||||
match = offsetCommentRegex.match(line)
|
||||
return match.group(1) if match is not None else None
|
||||
return OffsetMatch(module=match.group(1),
|
||||
address=int(match.group(2), 16),
|
||||
is_template=match.group(3) is not None,
|
||||
is_stub=match.group(4) is not None)
|
||||
|
||||
29
tools/isledecomp/tests/samples/basic_class.cpp
Normal file
29
tools/isledecomp/tests/samples/basic_class.cpp
Normal file
@ -0,0 +1,29 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// A very simple class
|
||||
|
||||
class TestClass {
|
||||
public:
|
||||
TestClass();
|
||||
virtual ~TestClass() override;
|
||||
|
||||
virtual MxResult Tickle() override; // vtable+08
|
||||
|
||||
// OFFSET: LEGO1 0x12345678
|
||||
inline const char* ClassName() const // vtable+0c
|
||||
{
|
||||
// 0xabcd1234
|
||||
return "TestClass";
|
||||
}
|
||||
|
||||
// OFFSET: LEGO1 0xdeadbeef
|
||||
inline MxBool IsA(const char* name) const override // vtable+10
|
||||
{
|
||||
return !strcmp(name, TestClass::ClassName());
|
||||
}
|
||||
|
||||
private:
|
||||
int m_hello;
|
||||
int m_hiThere;
|
||||
};
|
||||
8
tools/isledecomp/tests/samples/inline.cpp
Normal file
8
tools/isledecomp/tests/samples/inline.cpp
Normal file
@ -0,0 +1,8 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// OFFSET: LEGO1 0x10000001
|
||||
inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there
|
||||
|
||||
// OFFSET: LEGO1 0x10000002
|
||||
inline const char* OneLine() const { return "MxDSObject"; };
|
||||
23
tools/isledecomp/tests/samples/poorly_formatted.cpp
Normal file
23
tools/isledecomp/tests/samples/poorly_formatted.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
// Sample for python unit tests
|
||||
// Not part of the decomp
|
||||
|
||||
// While it's reasonable to expect a well-formed file (and clang-format
|
||||
// will make sure we get one), this will put the parser through its paces.
|
||||
|
||||
// OFFSET: TEST 0x1234
|
||||
void curly_with_spaces()
|
||||
{
|
||||
static char* msg = "hello";
|
||||
}
|
||||
|
||||
// OFFSET: TEST 0x5555
|
||||
void weird_closing_curly()
|
||||
{
|
||||
int x = 123; }
|
||||
|
||||
// OFFSET: HELLO 0x5656
|
||||
void bad_indenting() {
|
||||
if (0)
|
||||
{
|
||||
int y = 5;
|
||||
}}
|
||||
@ -30,9 +30,10 @@ def test_sanity():
|
||||
|
||||
assert len(blocks) == 3
|
||||
assert code_blocks_are_sorted(blocks) is True
|
||||
# n.b. The parser returns line numbers as 0-based
|
||||
assert blocks[0].start_line == 5
|
||||
assert blocks[0].end_line == 9
|
||||
# n.b. The parser returns line numbers as 1-based
|
||||
# Function starts when we see the opening curly brace
|
||||
assert blocks[0].start_line == 8
|
||||
assert blocks[0].end_line == 10
|
||||
|
||||
|
||||
def test_oneline():
|
||||
@ -42,11 +43,7 @@ def test_oneline():
|
||||
blocks = find_code_blocks(f)
|
||||
|
||||
assert len(blocks) == 2
|
||||
assert blocks[0].start_line == 3
|
||||
# TODO: Because of the way it works now, this captures the blank line
|
||||
# as part of the function. That's not *incorrect* per se, but
|
||||
# this needs to be more consistent if we want the tool to sort the
|
||||
# code blocks in the file.
|
||||
assert blocks[0].start_line == 5
|
||||
assert blocks[0].end_line == 5
|
||||
|
||||
|
||||
@ -70,3 +67,39 @@ def test_jumbled_case():
|
||||
|
||||
assert len(blocks) == 3
|
||||
assert code_blocks_are_sorted(blocks) is False
|
||||
|
||||
|
||||
def test_bad_file():
|
||||
with sample_file('poorly_formatted.cpp') as f:
|
||||
blocks = find_code_blocks(f)
|
||||
|
||||
assert len(blocks) == 3
|
||||
|
||||
|
||||
def test_indented():
|
||||
"""Offsets for functions inside of a class will probably be indented."""
|
||||
with sample_file('basic_class.cpp') as f:
|
||||
blocks = find_code_blocks(f)
|
||||
|
||||
# TODO: We don't properly detect the end of these functions
|
||||
# because the closing brace is indented. However... knowing where each
|
||||
# function ends is less important (for now) than capturing
|
||||
# all the functions that are there.
|
||||
|
||||
assert len(blocks) == 2
|
||||
assert blocks[0].offset == int('0x12345678', 16)
|
||||
assert blocks[0].start_line == 15
|
||||
#assert blocks[0].end_line == 18
|
||||
|
||||
assert blocks[1].offset == int('0xdeadbeef', 16)
|
||||
assert blocks[1].start_line == 22
|
||||
#assert blocks[1].end_line == 24
|
||||
|
||||
def test_inline():
|
||||
with sample_file('inline.cpp') as f:
|
||||
blocks = find_code_blocks(f)
|
||||
|
||||
assert len(blocks) == 2
|
||||
for block in blocks:
|
||||
assert block.start_line is not None
|
||||
assert block.start_line == block.end_line
|
||||
|
||||
Loading…
Reference in New Issue
Block a user