Allow multiple offset markers, pep8 cleanup

This commit is contained in:
disinvite 2023-11-14 22:25:37 -05:00
parent f734d2733d
commit 09688ed83f
5 changed files with 142 additions and 47 deletions

View File

@ -10,6 +10,7 @@
is_exact_offset_comment,
template_function_name,
remove_trailing_comment,
distinct_module,
)
@ -31,11 +32,8 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
blocks = []
offset_match = OffsetMatch(module=None,
address=None,
is_template=None,
is_stub=None)
offset_comment = None
offset_matches = []
function_sig = None
start_line = None
end_line = None
@ -50,15 +48,19 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
# Do this before reading again so that an EOF will not
# cause us to miss the last function of the file.
if state == ReaderState.FUNCTION_DONE:
block = CodeBlock(offset=offset_match.address,
signature=function_sig,
start_line=start_line,
end_line=end_line,
offset_comment=offset_comment,
module=offset_match.module,
is_template=offset_match.is_template,
is_stub=offset_match.is_stub)
blocks.append(block)
# Our list of offset marks could have duplicates on
# module name, so we'll eliminate those now.
for offset_match in distinct_module(offset_matches):
block = CodeBlock(offset=offset_match.address,
signature=function_sig,
start_line=start_line,
end_line=end_line,
offset_comment=offset_match.comment,
module=offset_match.module,
is_template=offset_match.is_template,
is_stub=offset_match.is_stub)
blocks.append(block)
offset_matches = []
state = ReaderState.WANT_OFFSET
if can_seek:
@ -67,19 +69,33 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
if line == '':
break
if (state != ReaderState.WANT_OFFSET and
match_offset_comment(line) is not None):
# We hit another offset unexpectedly.
# We can recover easily by just ending the function here.
end_line = line_no - 1
state = ReaderState.FUNCTION_DONE
new_match = match_offset_comment(line)
if new_match is not None:
# We will allow multiple offsets if we have just begun
# the code block, but not after we hit the curly brace.
if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
ReaderState.WANT_SIG):
# If we detected an offset marker unexpectedly,
# we are handling it here so we can continue seeking.
can_seek = True
# Pause reading here so we handle the offset marker
# on the next loop iteration
can_seek = False
offset_matches.append(new_match)
# Regular state machine handling begins now
if state == ReaderState.IN_TEMPLATE:
if new_match.is_template:
state = ReaderState.IN_TEMPLATE
else:
state = ReaderState.WANT_SIG
else:
# We hit another offset unexpectedly.
# We can recover easily by just ending the function here.
end_line = line_no - 1
state = ReaderState.FUNCTION_DONE
# Pause reading here so we handle the offset marker
# on the next loop iteration
can_seek = False
elif state == ReaderState.IN_TEMPLATE:
# TEMPLATE functions are a special case. The signature is
# given on the next line (in a // comment)
function_sig = template_function_name(line)
@ -92,13 +108,14 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
# marker. There is not a formal procedure for this, so just
# assume the next "code line" is the function signature
if not is_blank_or_comment(line):
# Inline functions may end with a comment. Strip that out
# to help parsing.
function_sig = remove_trailing_comment(line.strip())
# Now check to see if the opening curly bracket is on the
# same line. clang-format should prevent this (BraceWrapping)
# but it is easy to detect.
# If the entire function is on one line, we can handle that
# too, although this should be limited to inlines.
# If the entire function is on one line, handle that too.
if function_sig.endswith('{'):
start_line = line_no
state = ReaderState.IN_FUNC
@ -122,18 +139,4 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
end_line = line_no
state = ReaderState.FUNCTION_DONE
elif state == ReaderState.WANT_OFFSET:
# If we detected an offset marker unexpectedly, we are handling
# it here so we can continue seeking.
can_seek = True
match = match_offset_comment(line)
if match is not None:
offset_match = match
offset_comment = line.strip()
if match.is_template:
state = ReaderState.IN_TEMPLATE
else:
state = ReaderState.WANT_SIG
return blocks

View File

@ -1,5 +1,5 @@
# C++ Parser utility functions and data structures
from __future__ import annotations # python <3.10 compatibility
from __future__ import annotations # python <3.10 compatibility
import re
from collections import namedtuple
@ -8,8 +8,8 @@
['offset', 'signature', 'start_line', 'end_line',
'offset_comment', 'module', 'is_template', 'is_stub'])
OffsetMatch = namedtuple('OffsetMatch', ['module', 'address',
'is_template', 'is_stub'])
OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
'is_stub', 'comment'])
# This has not been formally established, but considering that "STUB"
# is a temporary state for a function, we assume it will appear last,
@ -74,4 +74,23 @@ def match_offset_comment(line: str) -> OffsetMatch | None:
return OffsetMatch(module=match.group(1),
address=int(match.group(2), 16),
is_template=match.group(3) is not None,
is_stub=match.group(4) is not None)
is_stub=match.group(4) is not None,
comment=line.strip())
def distinct_module(offsets: "list[OffsetMatch]") -> "list[OffsetMatch]":
    """Return the offset markers with duplicate module names removed.

    Module names are compared case-insensitively (via ``str.upper``). If a
    module name is repeated, the marker that appears first is kept and the
    later ones are dropped. Kept markers stay in their input order.

    Args:
        offsets: Offset markers to filter. Any iterable works; each item
            only needs a string ``module`` attribute.

    Returns:
        A new list holding the first marker seen for each distinct module
        name. The input object itself is never returned or modified.
    """
    # Dict maintains insertion order in python >=3.7, so the values come
    # back in first-seen order.
    first_by_module = {}
    for offset in offsets:
        # setdefault stores only when the key is absent: first one wins.
        first_by_module.setdefault(offset.module.upper(), offset)

    return list(first_by_module.values())

View File

@ -0,0 +1,25 @@
// Sample for python unit tests
// Not part of the decomp
// Handling multiple offset markers
// OFFSET: TEST 0x1234
// OFFSET: HELLO 0x5555
void different_modules()
{
    // TODO: empty on purpose; this file is only a parser test sample. Case: two markers naming different modules.
}
// OFFSET: TEST 0x2345
// OFFSET: TEST 0x1234
void same_module()
{
    // TODO: empty on purpose; this file is only a parser test sample. Case: two markers naming the same module.
}
// OFFSET: TEST 0x2002
// OFFSET: test 0x1001
void same_case_insensitive()
{
    // TODO: empty on purpose; this file is only a parser test sample. Case: module names differing only in letter case.
}

View File

@ -89,11 +89,12 @@ def test_indented():
assert len(blocks) == 2
assert blocks[0].offset == int('0x12345678', 16)
assert blocks[0].start_line == 15
#assert blocks[0].end_line == 18
# assert blocks[0].end_line == 18
assert blocks[1].offset == int('0xdeadbeef', 16)
assert blocks[1].start_line == 22
#assert blocks[1].end_line == 24
# assert blocks[1].end_line == 24
def test_inline():
with sample_file('inline.cpp') as f:
@ -103,3 +104,25 @@ def test_inline():
for block in blocks:
assert block.start_line is not None
assert block.start_line == block.end_line
def test_multiple_offsets():
    """If multiple offset marks appear before a code block, take them
    all but ensure module name (case-insensitive) is distinct.
    Use first module occurrence in case of duplicates."""
    with sample_file('multiple_offsets.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 4

    # Two markers with different modules: one block per marker, both
    # describing the same function body.
    assert blocks[0].module == 'TEST'
    assert blocks[0].offset == 0x1234
    assert blocks[0].start_line == 9

    assert blocks[1].module == 'HELLO'
    assert blocks[1].offset == 0x5555
    assert blocks[1].start_line == 9

    # Duplicate modules are ignored: only the first marker is kept
    assert blocks[2].module == 'TEST'
    assert blocks[2].start_line == 16
    assert blocks[2].offset == 0x2345

    # Module names that differ only by case count as duplicates too
    assert blocks[3].module == 'TEST'
    assert blocks[3].offset == 0x2002

View File

@ -1,8 +1,10 @@
import pytest
from collections import namedtuple
from isledecomp.parser.util import (
is_blank_or_comment,
match_offset_comment,
is_exact_offset_comment,
distinct_module,
)
@ -85,3 +87,26 @@ def test_offset_match(line: str, match: bool, exact):
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_exact_offset_comment(line: str, exact: bool, match):
assert is_exact_offset_comment(line) is exact
# Helper for the next test: cut down version of OffsetMatch
MiniOfs = namedtuple('MiniOfs', ['module', 'value'])

# Each entry is an (input markers, expected result) pair.
distinct_module_samples = [
    # empty set
    ([], []),

    # same module name
    (
        [
            MiniOfs(module='TEST', value=123),
            MiniOfs(module='TEST', value=555),
        ],
        [MiniOfs(module='TEST', value=123)],
    ),

    # same module name, case-insensitive
    (
        [
            MiniOfs(module='test', value=123),
            MiniOfs(module='TEST', value=555),
        ],
        [MiniOfs(module='test', value=123)],
    ),

    # duplicates, non-consecutive
    (
        [
            MiniOfs(module='test', value=123),
            MiniOfs(module='abc', value=111),
            MiniOfs(module='TEST', value=555),
        ],
        [
            MiniOfs(module='test', value=123),
            MiniOfs(module='abc', value=111),
        ],
    ),
]
@pytest.mark.parametrize('sample, expected', distinct_module_samples)
def test_distinct_module(sample: "list[MiniOfs]",
                         expected: "list[MiniOfs]"):
    """distinct_module keeps the first entry for each module name
    (case-insensitive) and returns the survivors in input order."""
    assert distinct_module(sample) == expected