From abefacf9b2afb689a1e29bf1249f7307f6ef3aa0 Mon Sep 17 00:00:00 2001 From: disinvite Date: Tue, 16 Jan 2024 19:02:18 -0500 Subject: [PATCH] Enable string annotations and indirect globals --- tools/isledecomp/isledecomp/compare/core.py | 28 +++- tools/isledecomp/isledecomp/compare/db.py | 5 +- .../isledecomp/isledecomp/cvdump/analysis.py | 6 +- .../isledecomp/isledecomp/cvdump/demangler.py | 11 +- .../isledecomp/isledecomp/parser/codebase.py | 4 + tools/isledecomp/isledecomp/parser/error.py | 4 + tools/isledecomp/isledecomp/parser/linter.py | 33 ++++- tools/isledecomp/isledecomp/parser/marker.py | 30 +++++ tools/isledecomp/isledecomp/parser/node.py | 5 + tools/isledecomp/isledecomp/parser/parser.py | 121 ++++++++++-------- tools/isledecomp/isledecomp/parser/util.py | 22 ++++ tools/isledecomp/tests/test_demangler.py | 1 + tools/isledecomp/tests/test_linter.py | 30 +++++ tools/isledecomp/tests/test_parser.py | 79 ++++++++++++ .../tests/test_parser_statechange.py | 8 +- tools/isledecomp/tests/test_parser_util.py | 16 +++ 16 files changed, 332 insertions(+), 71 deletions(-) diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py index 40b93bae..c8ae7809 100644 --- a/tools/isledecomp/isledecomp/compare/core.py +++ b/tools/isledecomp/isledecomp/compare/core.py @@ -85,13 +85,19 @@ def _load_cvdump(self): if sym.node_type == SymbolType.STRING: string_info = demangle_string_const(sym.decorated_name) + if string_info is None: + logger.debug( + "Could not demangle string symbol: %s", sym.decorated_name + ) + continue + # TODO: skip unicode for now. will need to handle these differently. if string_info.is_utf16: continue raw = self.recomp_bin.read(addr, sym.size()) try: - sym.friendly_name = raw.decode("latin1") + sym.friendly_name = raw.decode("latin1").rstrip("\x00") except UnicodeDecodeError: pass @@ -134,6 +140,26 @@ def _load_markers(self): for tbl in codebase.iter_vtables(): self._db.match_vtable(tbl.offset, tbl.name) + for string in codebase.iter_strings(): + # Not that we don't trust you, but we're checking the string + # annotation to make sure it is accurate. + try: + # TODO: would presumably fail for wchar_t strings + orig = self.orig_bin.read_string(string.offset).decode("latin1") + string_correct = string.name == orig + except UnicodeDecodeError: + string_correct = False + + if not string_correct: + logger.error( + "Data at 0x%x does not match string %s", + string.offset, + repr(string.name), + ) + continue + + self._db.match_string(string.offset, string.name) + def _find_original_strings(self): """Go to the original binary and look for the specified string constants to find a match. This is a (relatively) expensive operation so we only diff --git a/tools/isledecomp/isledecomp/compare/db.py b/tools/isledecomp/isledecomp/compare/db.py index 8cba0c03..d2c2cbf9 100644 --- a/tools/isledecomp/isledecomp/compare/db.py +++ b/tools/isledecomp/isledecomp/compare/db.py @@ -43,7 +43,8 @@ def match_name(self) -> str: return None ctype = self.compare_type.name if self.compare_type is not None else "UNK" - return f"{self.name} ({ctype})" + name = repr(self.name) if ctype == "STRING" else self.name + return f"{name} ({ctype})" def matchinfo_factory(_, row): @@ -197,3 +198,5 @@ def match_string(self, addr: int, value: str) -> bool: if not did_match: escaped = repr(value) logger.error("Failed to find string: %s", escaped) + + return did_match diff --git a/tools/isledecomp/isledecomp/cvdump/analysis.py b/tools/isledecomp/isledecomp/cvdump/analysis.py index 720593be..330429dd 100644 --- a/tools/isledecomp/isledecomp/cvdump/analysis.py +++ b/tools/isledecomp/isledecomp/cvdump/analysis.py @@ -94,7 +94,11 @@ def set_decorated(self, name: str): def name(self) -> Optional[str]: """Prefer "friendly" name if we have it. This is what we have been using to match functions.""" - return self.friendly_name or self.decorated_name + return ( + self.friendly_name + if self.friendly_name is not None + else self.decorated_name + ) def size(self) -> Optional[int]: if self.confirmed_size is not None: diff --git a/tools/isledecomp/isledecomp/cvdump/demangler.py b/tools/isledecomp/isledecomp/cvdump/demangler.py index 340f19a3..07fca42f 100644 --- a/tools/isledecomp/isledecomp/cvdump/demangler.py +++ b/tools/isledecomp/isledecomp/cvdump/demangler.py @@ -4,6 +4,7 @@ """ import re from collections import namedtuple +from typing import Optional class InvalidEncodedNumberError(Exception): @@ -30,13 +31,12 @@ def parse_encoded_number(string: str) -> int: StringConstInfo = namedtuple("StringConstInfo", "len is_utf16") -def demangle_string_const(symbol: str) -> StringConstInfo: +def demangle_string_const(symbol: str) -> Optional[StringConstInfo]: """Don't bother to decode the string text from the symbol. We can just read it from the binary once we have the length.""" match = string_const_regex.match(symbol) if match is None: - # See below - return StringConstInfo(0, False) + return None try: strlen = ( @@ -45,10 +45,7 @@ def demangle_string_const(symbol: str) -> StringConstInfo: else int(match.group("len")) ) except (ValueError, InvalidEncodedNumberError): - # This would be an annoying error to fail on if we get a bad symbol. - # For now, just assume a zero length string because this will probably - # raise some eyebrows during the comparison. - strlen = 0 + return None is_utf16 = match.group("is_utf16") == "1" return StringConstInfo(len=strlen, is_utf16=is_utf16) diff --git a/tools/isledecomp/isledecomp/parser/codebase.py b/tools/isledecomp/isledecomp/parser/codebase.py index 54ab4035..184b1256 100644 --- a/tools/isledecomp/isledecomp/parser/codebase.py +++ b/tools/isledecomp/isledecomp/parser/codebase.py @@ -6,6 +6,7 @@ ParserFunction, ParserVtable, ParserVariable, + ParserString, ) @@ -42,3 +43,6 @@ def iter_vtables(self) -> Iterator[ParserVtable]: def iter_variables(self) -> Iterator[ParserVariable]: return filter(lambda s: isinstance(s, ParserVariable), self._symbols) + + def iter_strings(self) -> Iterator[ParserString]: + return filter(lambda s: isinstance(s, ParserString), self._symbols) diff --git a/tools/isledecomp/isledecomp/parser/error.py b/tools/isledecomp/isledecomp/parser/error.py index eb1aa7b8..81123381 100644 --- a/tools/isledecomp/isledecomp/parser/error.py +++ b/tools/isledecomp/isledecomp/parser/error.py @@ -70,6 +70,10 @@ class ParserError(Enum): # a comment -- i.e. VTABLE or GLOBAL -- could not extract the name NO_SUITABLE_NAME = 204 + # ERROR: Two STRING markers have the same module and offset, but the strings + # they annotate are different. + WRONG_STRING = 205 + @dataclass class ParserAlert: diff --git a/tools/isledecomp/isledecomp/parser/linter.py b/tools/isledecomp/isledecomp/parser/linter.py index 54406ce9..b44487df 100644 --- a/tools/isledecomp/isledecomp/parser/linter.py +++ b/tools/isledecomp/isledecomp/parser/linter.py @@ -1,7 +1,7 @@ from typing import List, Optional from .parser import DecompParser from .error import ParserAlert, ParserError -from .node import ParserSymbol +from .node import ParserSymbol, ParserString def get_checkorder_filter(module): @@ -19,6 +19,9 @@ def __init__(self) -> None: # This is _not_ reset between files and is intended to report offset reuse # when scanning the entire directory. self._offsets_used = set() + # Keep track of strings we have seen. Persists across files. + # Module/offset can be repeated for string markers but the strings must match. + self._strings = {} def reset(self, full_reset: bool = False): self.alerts = [] @@ -28,6 +31,7 @@ def reset(self, full_reset: bool = False): if full_reset: self._offsets_used.clear() + self._strings = {} def file_is_header(self): return self._filename.lower().endswith(".h") @@ -36,17 +40,31 @@ def _load_offsets_from_list(self, marker_list: List[ParserSymbol]): """Helper for loading (module, offset) tuples while the DecompParser has them broken up into three different lists.""" for marker in marker_list: + is_string = isinstance(marker, ParserString) + value = (marker.module, marker.offset) if value in self._offsets_used: - self.alerts.append( - ParserAlert( - code=ParserError.DUPLICATE_OFFSET, - line_number=marker.line_number, - line=f"0x{marker.offset:08x}", + if is_string: + if self._strings[value] != marker.name: + self.alerts.append( + ParserAlert( + code=ParserError.WRONG_STRING, + line_number=marker.line_number, + line=f"0x{marker.offset:08x}, {repr(self._strings[value])} vs. {repr(marker.name)}", + ) + ) + else: + self.alerts.append( + ParserAlert( + code=ParserError.DUPLICATE_OFFSET, + line_number=marker.line_number, + line=f"0x{marker.offset:08x}", + ) ) - ) else: self._offsets_used.add(value) + if is_string: + self._strings[value] = marker.name def _check_function_order(self): """Rules: @@ -82,6 +100,7 @@ def _check_offset_uniqueness(self): self._load_offsets_from_list(self._parser.functions) self._load_offsets_from_list(self._parser.vtables) self._load_offsets_from_list(self._parser.variables) + self._load_offsets_from_list(self._parser.strings) def _check_byname_allowed(self): if self.file_is_header(): diff --git a/tools/isledecomp/isledecomp/parser/marker.py b/tools/isledecomp/isledecomp/parser/marker.py index d2e181dd..de8c6f05 100644 --- a/tools/isledecomp/isledecomp/parser/marker.py +++ b/tools/isledecomp/isledecomp/parser/marker.py @@ -3,6 +3,19 @@ from enum import Enum +class MarkerCategory(Enum): + """For the purposes of grouping multiple different DecompMarkers together, + assign a rough "category" for the MarkerType values below. + It's really only the function types that have to get folded down, but + we'll do that in a structured way to permit future expansion.""" + + FUNCTION = 1 + VARIABLE = 2 + STRING = 3 + VTABLE = 4 + ADDRESS = 100 # i.e. no comparison required or possible + + class MarkerType(Enum): UNKNOWN = -100 FUNCTION = 1 @@ -51,6 +64,23 @@ def module(self) -> str: def offset(self) -> int: return self._offset + @property + def category(self) -> MarkerCategory: + if self.is_vtable(): + return MarkerCategory.VTABLE + + if self.is_variable(): + return MarkerCategory.VARIABLE + + if self.is_string(): + return MarkerCategory.STRING + + # TODO: worth another look if we add more types, but this covers it + if self.is_regular_function() or self.is_explicit_byname(): + return MarkerCategory.FUNCTION + + return MarkerCategory.ADDRESS + def is_regular_function(self) -> bool: """Regular function, meaning: not an explicit byname lookup. FUNCTION markers can be _implicit_ byname. diff --git a/tools/isledecomp/isledecomp/parser/node.py b/tools/isledecomp/isledecomp/parser/node.py index b6561fd6..71d4ebdd 100644 --- a/tools/isledecomp/isledecomp/parser/node.py +++ b/tools/isledecomp/isledecomp/parser/node.py @@ -55,3 +55,8 @@ class ParserVariable(ParserSymbol): @dataclass class ParserVtable(ParserSymbol): pass + + +@dataclass +class ParserString(ParserSymbol): + pass diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py index d4153b3c..3c955fc3 100644 --- a/tools/isledecomp/isledecomp/parser/parser.py +++ b/tools/isledecomp/isledecomp/parser/parser.py @@ -3,11 +3,11 @@ from typing import List, Iterable, Iterator, Optional from enum import Enum from .util import ( - is_blank_or_comment, get_class_name, get_variable_name, get_synthetic_name, remove_trailing_comment, + get_string_contents, ) from .marker import ( DecompMarker, @@ -19,6 +19,7 @@ ParserFunction, ParserVariable, ParserVtable, + ParserString, ) from .error import ParserAlert, ParserError @@ -43,17 +44,16 @@ def __init__(self) -> None: def insert(self, marker: DecompMarker) -> bool: """Return True if this insert would overwrite""" - module = marker.module - if module in self.markers: + key = (marker.category, marker.module) + if key in self.markers: return True - # TODO: type converted back to string version here instead of using enum - self.markers[module] = (marker.type.name, marker.offset) + self.markers[key] = marker return False def iter(self) -> Iterator[DecompMarker]: - for module, (marker_type, offset) in self.markers.items(): - yield DecompMarker(marker_type, module, offset) + for _, marker in self.markers.items(): + yield marker def empty(self): self.markers = {} @@ -111,17 +111,21 @@ def reset(self): self.function_sig = "" @property - def functions(self) -> List[ParserSymbol]: + def functions(self) -> List[ParserFunction]: return [s for s in self._symbols if isinstance(s, ParserFunction)] @property - def vtables(self) -> List[ParserSymbol]: + def vtables(self) -> List[ParserVtable]: return [s for s in self._symbols if isinstance(s, ParserVtable)] @property - def variables(self) -> List[ParserSymbol]: + def variables(self) -> List[ParserVariable]: return [s for s in self._symbols if isinstance(s, ParserVariable)] + @property + def strings(self) -> List[ParserString]: + return [s for s in self._symbols if isinstance(s, ParserString)] + def iter_symbols(self, module: Optional[str] = None) -> Iterator[ParserSymbol]: for s in self._symbols: if module is None or s.module == module: @@ -225,21 +229,35 @@ def _variable_marker(self, marker: DecompMarker): else: self.state = ReaderState.IN_GLOBAL - def _variable_done(self, name: str): - if not name.startswith("g_"): - self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX) + def _variable_done( + self, variable_name: Optional[str] = None, string_value: Optional[str] = None + ): + if variable_name is None and string_value is None: + self._syntax_error(ParserError.NO_SUITABLE_NAME) + return for marker in self.var_markers.iter(): - self._symbols.append( - ParserVariable( - type=marker.type, - line_number=self.line_number, - module=marker.module, - offset=marker.offset, - name=name, - is_static=self.state == ReaderState.IN_FUNC_GLOBAL, + if marker.is_string(): + self._symbols.append( + ParserString( + type=marker.type, + line_number=self.line_number, + module=marker.module, + offset=marker.offset, + name=string_value, + ) + ) + else: + self._symbols.append( + ParserVariable( + type=marker.type, + line_number=self.line_number, + module=marker.module, + offset=marker.offset, + name=variable_name, + is_static=self.state == ReaderState.IN_FUNC_GLOBAL, + ) ) - ) self.var_markers.empty() if self.state == ReaderState.IN_FUNC_GLOBAL: @@ -298,20 +316,8 @@ def _handle_marker(self, marker: DecompMarker): else: self._syntax_error(ParserError.INCOMPATIBLE_MARKER) - elif marker.is_string(): - # TODO: We are ignoring string markers for the moment. - # We already have a lot of them in the codebase, though, so we'll - # hang onto them for now in case we can use them later. - # To match up string constants, the strategy will be: - # 1. Use cvdump to find all string constants in the recomp - # 2. In the original binary, look at relocated vaddrs from .rdata - # 3. Try to match up string data from #1 with locations in #2 - - # Throw the syntax error we would throw if we were parsing these - if self.state not in (ReaderState.SEARCH, ReaderState.IN_FUNC): - self._syntax_error(ParserError.INCOMPATIBLE_MARKER) - - elif marker.is_variable(): + # Strings and variables are almost the same thing + elif marker.is_string() or marker.is_variable(): if self.state in ( ReaderState.SEARCH, ReaderState.IN_GLOBAL, @@ -418,24 +424,39 @@ def read_line(self, line: str): # function we have already parsed if state == IN_FUNC_GLOBAL. # However, we are not tolerant of _any_ syntax problems in our # CI actions, so the solution is to just fix the invalid marker. - if is_blank_or_comment(line): - self._syntax_error(ParserError.NO_SUITABLE_NAME) + variable_name = None + + global_markers_queued = any( + m.is_variable() for m in self.var_markers.iter() + ) + + if len(line_strip) == 0: + self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE) return - # We don't have a foolproof mechanism to tell what is and is not a variable. - # If the GLOBAL is being declared on a `return` statement, though, this is - # not correct. It is either a string literal (which will be handled differently) - # or it is not the variable declaration, which is incorrect decomp syntax. - if line.strip().startswith("return"): - self._syntax_error(ParserError.GLOBAL_NOT_VARIABLE) - return + if global_markers_queued: + # Not the greatest solution, but a consequence of combining GLOBAL and + # STRING markers together. If the marker precedes a return statement, it is + # valid for a STRING marker to be here, but not a GLOBAL. We need to look + # ahead and tell whether this *would* fail. + if line_strip.startswith("return"): + self._syntax_error(ParserError.GLOBAL_NOT_VARIABLE) + return + if line_strip.startswith("//"): + # If we found a comment, assume implicit lookup-by-name + # function and end here. We know this is not a decomp marker + # because it would have been handled already. + variable_name = get_synthetic_name(line) + else: + variable_name = get_variable_name(line) + # This is out of our control for library variables, but all of our + # variables should start with "g_". + if variable_name is not None and not variable_name.startswith("g_"): + self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX) - name = get_variable_name(line) - if name is None: - self._syntax_error(ParserError.NO_SUITABLE_NAME) - return + string_name = get_string_contents(line) - self._variable_done(name) + self._variable_done(variable_name, string_name) elif self.state == ReaderState.IN_VTABLE: vtable_class = get_class_name(line) diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py index a106d841..dd90fd03 100644 --- a/tools/isledecomp/isledecomp/parser/util.py +++ b/tools/isledecomp/isledecomp/parser/util.py @@ -1,6 +1,7 @@ # C++ Parser utility functions and data structures import re from typing import Optional +from ast import literal_eval # The goal here is to just read whatever is on the next line, so some # flexibility in the formatting seems OK @@ -12,6 +13,10 @@ trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$") +# Get string contents, ignore escape characters that might interfere +doubleQuoteRegex = re.compile(r"(\"(?:[^\"\\]|\\.)*\")") + + def get_synthetic_name(line: str) -> Optional[str]: """Synthetic names appear on a single line comment on the line after the marker. If that's not what we have, return None""" @@ -86,3 +91,20 @@ def get_variable_name(line: str) -> Optional[str]: return match.group("name") return None + + +def get_string_contents(line: str) -> Optional[str]: + """Return the first C string seen on this line. + We have to unescape the string, and a simple way to do that is to use + python's ast.literal_eval. I'm sure there are many pitfalls to doing + it this way, but hopefully the regex will ensure reasonably sane input.""" + + try: + if (match := doubleQuoteRegex.search(line)) is not None: + return literal_eval(match.group(1)) + # pylint: disable=broad-exception-caught + # No way to predict what kind of exception could occur. + except Exception: + pass + + return None diff --git a/tools/isledecomp/tests/test_demangler.py b/tools/isledecomp/tests/test_demangler.py index 5343bdcc..32cac502 100644 --- a/tools/isledecomp/tests/test_demangler.py +++ b/tools/isledecomp/tests/test_demangler.py @@ -14,6 +14,7 @@ 14, True, ), + ("??_C@_00A@?$AA@", 0, False), ] diff --git a/tools/isledecomp/tests/test_linter.py b/tools/isledecomp/tests/test_linter.py index 84ce6495..95d19515 100644 --- a/tools/isledecomp/tests/test_linter.py +++ b/tools/isledecomp/tests/test_linter.py @@ -112,3 +112,33 @@ def test_duplicate_offsets(linter): # Full reset will forget seen offsets. linter.reset(True) assert linter.check_lines(lines, "test.h", "TEST") is True + + +def test_duplicate_strings(linter): + """Duplicate string markers are okay if the string value is the same.""" + string_lines = [ + "// STRING: TEST 0x1000", + 'return "hello world";', + ] + + # No problem to use this marker twice. + assert linter.check_lines(string_lines, "test.h", "TEST") is True + assert linter.check_lines(string_lines, "test.h", "TEST") is True + + different_string = [ + "// STRING: TEST 0x1000", + 'return "hi there";', + ] + + # Same address but the string is different + assert linter.check_lines(different_string, "greeting.h", "TEST") is False + assert len(linter.alerts) == 1 + assert linter.alerts[0].code == ParserError.WRONG_STRING + + same_addr_reused = [ + "// GLOBAL:TEXT 0x1000", + "int g_test = 123;", + ] + + # This will fail like any other offset reuse. + assert linter.check_lines(same_addr_reused, "other.h", "TEST") is False diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py index 853e773d..6d8b72b1 100644 --- a/tools/isledecomp/tests/test_parser.py +++ b/tools/isledecomp/tests/test_parser.py @@ -442,3 +442,82 @@ def test_static_variable(parser): ) assert len(parser.variables) == 2 assert parser.variables[1].is_static is True + + +def test_reject_global_return(parser): + """Previously we had annotated strings with the GLOBAL marker. + For example: if a function returned a string. We now want these to be + annotated with the STRING marker.""" + + parser.read_lines( + [ + "// FUNCTION: TEST 0x5555", + "void test_function() {", + " // GLOBAL: TEST 0x8888", + ' return "test";', + "}", + ] + ) + assert len(parser.variables) == 0 + assert len(parser.alerts) == 1 + assert parser.alerts[0].code == ParserError.GLOBAL_NOT_VARIABLE + + +def test_global_string(parser): + """We now allow GLOBAL and STRING markers for the same item.""" + + parser.read_lines( + [ + "// GLOBAL: TEST 0x1234", + "// STRING: TEXT 0x5555", + 'char* g_test = "hello";', + ] + ) + assert len(parser.variables) == 1 + assert len(parser.strings) == 1 + assert len(parser.alerts) == 0 + + assert parser.variables[0].name == "g_test" + assert parser.strings[0].name == "hello" + + +def test_comment_variables(parser): + """Match on hidden variables from libraries.""" + + parser.read_lines( + [ + "// GLOBAL: TEST 0x1234", + "// g_test", + ] + ) + assert len(parser.variables) == 1 + assert parser.variables[0].name == "g_test" + + +def test_flexible_variable_prefix(parser): + """Don't alert to library variables that lack the g_ prefix. + This is out of our control.""" + + parser.read_lines( + [ + "// GLOBAL: TEST 0x1234", + "// some_other_variable", + ] + ) + assert len(parser.variables) == 1 + assert len(parser.alerts) == 0 + assert parser.variables[0].name == "some_other_variable" + + +def test_string_ignore_g_prefix(parser): + """String annotations above a regular variable should not alert to + the missing g_ prefix. This is only required for GLOBAL markers.""" + + parser.read_lines( + [ + "// STRING: TEST 0x1234", + 'const char* value = "";', + ] + ) + assert len(parser.strings) == 1 + assert len(parser.alerts) == 0 diff --git a/tools/isledecomp/tests/test_parser_statechange.py b/tools/isledecomp/tests/test_parser_statechange.py index cb54cf0a..be37e3c8 100644 --- a/tools/isledecomp/tests/test_parser_statechange.py +++ b/tools/isledecomp/tests/test_parser_statechange.py @@ -15,7 +15,7 @@ (_rs.SEARCH, "TEMPLATE", _rs.IN_TEMPLATE, None), (_rs.SEARCH, "VTABLE", _rs.IN_VTABLE, None), (_rs.SEARCH, "LIBRARY", _rs.IN_LIBRARY, None), - (_rs.SEARCH, "STRING", _rs.SEARCH, None), + (_rs.SEARCH, "STRING", _rs.IN_GLOBAL, None), (_rs.WANT_SIG, "FUNCTION", _rs.WANT_SIG, None), (_rs.WANT_SIG, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), @@ -33,7 +33,7 @@ (_rs.IN_FUNC, "TEMPLATE", _rs.IN_TEMPLATE, _pe.MISSED_END_OF_FUNCTION), (_rs.IN_FUNC, "VTABLE", _rs.IN_VTABLE, _pe.MISSED_END_OF_FUNCTION), (_rs.IN_FUNC, "LIBRARY", _rs.IN_LIBRARY, _pe.MISSED_END_OF_FUNCTION), - (_rs.IN_FUNC, "STRING", _rs.IN_FUNC, None), + (_rs.IN_FUNC, "STRING", _rs.IN_FUNC_GLOBAL, None), (_rs.IN_TEMPLATE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), (_rs.IN_TEMPLATE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), @@ -60,7 +60,7 @@ (_rs.IN_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), (_rs.IN_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), (_rs.IN_GLOBAL, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), - (_rs.IN_GLOBAL, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_GLOBAL, "STRING", _rs.IN_GLOBAL, None), (_rs.IN_FUNC_GLOBAL, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), (_rs.IN_FUNC_GLOBAL, "GLOBAL", _rs.IN_FUNC_GLOBAL, None), @@ -69,7 +69,7 @@ (_rs.IN_FUNC_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), (_rs.IN_FUNC_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), (_rs.IN_FUNC_GLOBAL, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), - (_rs.IN_FUNC_GLOBAL, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_FUNC_GLOBAL, "STRING", _rs.IN_FUNC_GLOBAL, None), (_rs.IN_VTABLE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), (_rs.IN_VTABLE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py index af102fc2..8a403710 100644 --- a/tools/isledecomp/tests/test_parser_util.py +++ b/tools/isledecomp/tests/test_parser_util.py @@ -10,6 +10,7 @@ is_blank_or_comment, get_class_name, get_variable_name, + get_string_contents, ) @@ -158,3 +159,18 @@ def test_get_class_name_none(line: str): @pytest.mark.parametrize("line,name", variable_name_cases) def test_get_variable_name(line: str, name: str): assert get_variable_name(line) == name + + +string_match_cases = [ + ('return "hello world";', "hello world"), + ('"hello\\\\"', "hello\\"), + ('"hello \\"world\\""', 'hello "world"'), + ('"hello\\nworld"', "hello\nworld"), + # Only match first string if there are multiple options + ('Method("hello", "world");', "hello"), +] + + +@pytest.mark.parametrize("line, string", string_match_cases) +def test_get_string_contents(line: str, string: str): + assert get_string_contents(line) == string