From a8f6e72b978ac664dacfb009212ebae5a0f5038e Mon Sep 17 00:00:00 2001 From: jonschz Date: Sun, 26 May 2024 21:58:19 +0200 Subject: [PATCH] Implement struct imports [skip ci] - This code is still in dire need of refactoring and tests - There are only single-digit issues left, and 2600 functions can be imported - The biggest remaining error is mismatched stacks --- .../import_functions_from_pdb.py | 66 +++-- tools/ghidra_scripts/lego_util/exceptions.py | 15 +- ...{pdb_to_ghidra.py => function_importer.py} | 36 ++- .../ghidra_scripts/lego_util/ghidra_helper.py | 58 +++- .../lego_util/pdb_extraction.py | 36 ++- .../ghidra_scripts/lego_util/type_importer.py | 260 ++++++++++++++++++ tools/isledecomp/isledecomp/compare/core.py | 10 +- tools/isledecomp/isledecomp/cvdump/types.py | 26 +- 8 files changed, 448 insertions(+), 59 deletions(-) rename tools/ghidra_scripts/lego_util/{pdb_to_ghidra.py => function_importer.py} (88%) create mode 100644 tools/ghidra_scripts/lego_util/type_importer.py diff --git a/tools/ghidra_scripts/import_functions_from_pdb.py b/tools/ghidra_scripts/import_functions_from_pdb.py index 3eb9405d..733936d0 100644 --- a/tools/ghidra_scripts/import_functions_from_pdb.py +++ b/tools/ghidra_scripts/import_functions_from_pdb.py @@ -15,6 +15,7 @@ import importlib from dataclasses import dataclass, field +import logging.handlers import sys import logging from pathlib import Path @@ -44,18 +45,25 @@ def reload_module(module: str): def setup_logging(): - logging.basicConfig( - format="%(levelname)-8s %(message)s", - stream=sys.stdout, - level=logging.INFO, - force=True, + logging.root.handlers.clear() + formatter = logging.Formatter("%(levelname)-8s %(message)s") + # formatter = logging.Formatter("%(name)s %(levelname)-8s %(message)s") # use this to identify loggers + stdout_handler = logging.StreamHandler(sys.stdout) + stdout_handler.setFormatter(formatter) + file_handler = logging.FileHandler( + Path(__file__).absolute().parent.joinpath("import.log"), 
mode="w" ) + file_handler.setFormatter(formatter) + logging.root.setLevel(GLOBALS.loglevel) + logging.root.addHandler(stdout_handler) + logging.root.addHandler(file_handler) logger.info("Starting...") @dataclass class Globals: verbose: bool + loglevel: int running_from_ghidra: bool = False make_changes: bool = False prompt_before_changes: bool = True @@ -64,7 +72,11 @@ class Globals: # hard-coded settings that we don't want to prompt in Ghidra every time -GLOBALS = Globals(verbose=False) +GLOBALS = Globals( + verbose=False, + # loglevel=logging.INFO, + loglevel=logging.DEBUG, +) # Disable spurious warnings in vscode / pylance @@ -111,14 +123,19 @@ def add_python_path(path: str): # We need to quote the types here because they might not exist when running without Ghidra def migrate_function_to_ghidra( - api: "FlatProgramAPI", match_info: "MatchInfo", signature: "FunctionSignature" + api: "FlatProgramAPI", + match_info: "MatchInfo", + signature: "FunctionSignature", + type_importer: "PdbTypeImporter", ): hex_original_address = f"{match_info.orig_addr:x}" # Find the Ghidra function at that address ghidra_address = getAddressFactory().getAddress(hex_original_address) - typed_pdb_function = PdbFunctionWithGhidraObjects(api, match_info, signature) + typed_pdb_function = PdbFunctionWithGhidraObjects( + api, match_info, signature, type_importer + ) if not GLOBALS.make_changes: return @@ -170,19 +187,20 @@ def migrate_function_to_ghidra( askChoice("Continue", "Click 'OK' to continue", ["OK"], "OK") -def process_functions(isle_compare: "IsleCompare"): - # try to acquire matched functions - migration = PdbExtractionForGhidraMigration(isle_compare) - func_signatures = migration.get_function_list() +def process_functions(extraction: "PdbExtractionForGhidraMigration"): + func_signatures = extraction.get_function_list() if not GLOBALS.running_from_ghidra: logger.info("Completed the dry run outside Ghidra.") return - fpapi = FlatProgramAPI(currentProgram()) + api = 
FlatProgramAPI(currentProgram()) + # TODO: Implement a "no changes" mode + type_importer = PdbTypeImporter(api, extraction) + for match_info, signature in func_signatures: try: - migrate_function_to_ghidra(fpapi, match_info, signature) + migrate_function_to_ghidra(api, match_info, signature, type_importer) GLOBALS.statistics.successes += 1 except Lego1Exception as e: log_and_track_failure(e) @@ -216,8 +234,11 @@ def main(): pdb_path = build_path.joinpath("LEGO1.pdb") if not GLOBALS.verbose: - logging.getLogger("isledecomp.compare.db").setLevel(logging.CRITICAL) - logging.getLogger("isledecomp.compare.lines").setLevel(logging.CRITICAL) + logging.getLogger("isledecomp.bin").setLevel(logging.WARNING) + logging.getLogger("isledecomp.compare.core").setLevel(logging.WARNING) + logging.getLogger("isledecomp.compare.db").setLevel(logging.WARNING) + logging.getLogger("isledecomp.compare.lines").setLevel(logging.WARNING) + logging.getLogger("isledecomp.cvdump.symbols").setLevel(logging.WARNING) logger.info("Starting comparison") with Bin(str(origfile_path), find_str=True) as origfile, Bin( @@ -227,8 +248,10 @@ def main(): logger.info("Comparison complete.") + # try to acquire matched functions + migration = PdbExtractionForGhidraMigration(isle_compare) try: - process_functions(isle_compare) + process_functions(migration) finally: if GLOBALS.running_from_ghidra: GLOBALS.statistics.log() @@ -265,8 +288,13 @@ def main(): ) if GLOBALS.running_from_ghidra: - reload_module("lego_util.pdb_to_ghidra") - from lego_util.pdb_to_ghidra import PdbFunctionWithGhidraObjects + reload_module("lego_util.ghidra_helper") + + reload_module("lego_util.function_importer") + from lego_util.function_importer import PdbFunctionWithGhidraObjects + + reload_module("lego_util.type_importer") + from lego_util.type_importer import PdbTypeImporter if __name__ == "__main__": main() diff --git a/tools/ghidra_scripts/lego_util/exceptions.py b/tools/ghidra_scripts/lego_util/exceptions.py index 
f120e717..e44c10f2 100644 --- a/tools/ghidra_scripts/lego_util/exceptions.py +++ b/tools/ghidra_scripts/lego_util/exceptions.py @@ -1,5 +1,13 @@ class Lego1Exception(Exception): - pass + """ + Our own base class for exceptions. + Makes it easier to distinguish expected and unexpected errors. + """ + + +class TypeNotFoundError(Lego1Exception): + def __str__(self): + return f"Type not found in PDB: {self.args[0]}" class TypeNotFoundInGhidraError(Lego1Exception): @@ -7,6 +15,11 @@ def __str__(self): return f"Type not found in Ghidra: {self.args[0]}" +class TypeNotImplementedError(Lego1Exception): + def __str__(self): + return f"Import not implemented for type: {self.args[0]}" + + class ClassOrNamespaceNotFoundInGhidraError(Lego1Exception): def __init__(self, namespaceHierachy: list[str]): super().__init__(namespaceHierachy) diff --git a/tools/ghidra_scripts/lego_util/pdb_to_ghidra.py b/tools/ghidra_scripts/lego_util/function_importer.py similarity index 88% rename from tools/ghidra_scripts/lego_util/pdb_to_ghidra.py rename to tools/ghidra_scripts/lego_util/function_importer.py index d312b004..c8f61e41 100644 --- a/tools/ghidra_scripts/lego_util/pdb_to_ghidra.py +++ b/tools/ghidra_scripts/lego_util/function_importer.py @@ -20,9 +20,11 @@ ) from lego_util.ghidra_helper import ( get_ghidra_namespace, - get_ghidra_type, + sanitize_class_name, ) + from lego_util.exceptions import StackOffsetMismatchError +from lego_util.type_importer import PdbTypeImporter logger = logging.getLogger(__name__) @@ -33,26 +35,36 @@ class PdbFunctionWithGhidraObjects: def __init__( self, - fpapi: FlatProgramAPI, + api: FlatProgramAPI, match_info: MatchInfo, signature: FunctionSignature, + type_importer: "PdbTypeImporter", ): - self.api = fpapi + self.api = api self.match_info = match_info self.signature = signature + self.type_importer = type_importer + + if signature.class_type is not None: + # Import the base class so the namespace exists + 
self.type_importer.pdb_to_ghidra_type(signature.class_type) assert match_info.name is not None - colon_split = match_info.name.split("::") + + colon_split = sanitize_class_name(match_info.name).split("::") self.name = colon_split.pop() namespace_hierachy = colon_split - self.namespace = get_ghidra_namespace(fpapi, namespace_hierachy) + self.namespace = get_ghidra_namespace(api, namespace_hierachy) - self.return_type = get_ghidra_type(fpapi, signature.return_type) + self.return_type = type_importer.pdb_to_ghidra_type( + signature.return_type + ) self.arguments = [ ParameterImpl( f"param{index}", - get_ghidra_type(fpapi, type_name), - fpapi.getCurrentProgram(), + # get_ghidra_type(api, type_name), + type_importer.pdb_to_ghidra_type(type_name), + api.getCurrentProgram(), ) for (index, type_name) in enumerate(signature.arglist) ] @@ -200,7 +212,13 @@ def _rename_stack_parameter(self, param: Parameter): f"Could not find a matching symbol at offset {param.getStackOffset()} in {self.get_full_name()}" ) - if param.getDataType() != get_ghidra_type(self.api, match.data_type): + if match.data_type == "T_NOTYPE(0000)": + logger.warning("Skipping stack parameter of type NOTYPE") + return + + if param.getDataType() != self.type_importer.pdb_to_ghidra_type( + match.data_type + ): logger.error( "Type mismatch for parameter: %s in Ghidra, %s in PDB", param, match ) diff --git a/tools/ghidra_scripts/lego_util/ghidra_helper.py b/tools/ghidra_scripts/lego_util/ghidra_helper.py index 20132ea3..39b7c351 100644 --- a/tools/ghidra_scripts/lego_util/ghidra_helper.py +++ b/tools/ghidra_scripts/lego_util/ghidra_helper.py @@ -16,6 +16,8 @@ from ghidra.program.model.data import DataType from ghidra.program.model.symbol import Namespace +logger = logging.getLogger(__name__) + def get_ghidra_type(api: FlatProgramAPI, type_name: str): """ @@ -44,14 +46,21 @@ def get_ghidra_type(api: FlatProgramAPI, type_name: str): raise MultipleTypesFoundInGhidraError(type_name, result) -def 
add_pointer_type(api: FlatProgramAPI, pointee: DataType): - data_type = PointerDataType(pointee) - data_type.setCategoryPath(pointee.getCategoryPath()) - api.getCurrentProgram().getDataTypeManager().addDataType( - data_type, DataTypeConflictHandler.KEEP_HANDLER +def add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType: + new_data_type = PointerDataType(pointee) + new_data_type.setCategoryPath(pointee.getCategoryPath()) + result_data_type = ( + api.getCurrentProgram() + .getDataTypeManager() + .addDataType(new_data_type, DataTypeConflictHandler.KEEP_HANDLER) ) - logging.info("Created new pointer type %s", data_type) - return data_type + if result_data_type is not new_data_type: + logger.debug( + "New pointer replaced by existing one. Fresh pointer: %s (class: %s)", + result_data_type, + result_data_type.__class__, + ) + return result_data_type def get_ghidra_namespace( @@ -63,3 +72,38 @@ def get_ghidra_namespace( if namespace is None: raise ClassOrNamespaceNotFoundInGhidraError(namespace_hierachy) return namespace + + +def create_ghidra_namespace( + api: FlatProgramAPI, namespace_hierachy: list[str] +) -> Namespace: + namespace = api.getCurrentProgram().getGlobalNamespace() + for part in namespace_hierachy: + namespace = api.getNamespace(namespace, part) + if namespace is None: + namespace = api.createNamespace(namespace, part) + return namespace + + +def sanitize_class_name(name: str) -> str: + """ + Takes a full class or function name and replaces characters not accepted by Ghidra. + Applies mostly to templates. 
+ """ + if "<" in name: + new_class_name = ( + "_template_" + + name + .replace("<", "[") + .replace(">", "]") + .replace("*", "#") + .replace(" ", "") + ) + logger.warning( + "Changing possible template class name from '%s' to '%s'", + name, + new_class_name, + ) + return new_class_name + + return name diff --git a/tools/ghidra_scripts/lego_util/pdb_extraction.py b/tools/ghidra_scripts/lego_util/pdb_extraction.py index 7d47c27e..9c884ef4 100644 --- a/tools/ghidra_scripts/lego_util/pdb_extraction.py +++ b/tools/ghidra_scripts/lego_util/pdb_extraction.py @@ -8,13 +8,11 @@ from isledecomp.compare import Compare as IsleCompare from isledecomp.compare.db import MatchInfo +from lego_util.exceptions import TypeNotFoundError + logger = logging.getLogger(__file__) -class TypeNotFoundError(Exception): - pass - - @dataclass class CppStackOrRegisterSymbol: name: str @@ -38,7 +36,7 @@ class FunctionSignature: call_type: str arglist: list[str] return_type: str - class_type: Optional[dict[str, Any]] + class_type: Optional[str] stack_symbols: list[CppStackOrRegisterSymbol] @@ -46,7 +44,7 @@ class PdbExtractionForGhidraMigration: def __init__(self, compare: IsleCompare): self.compare = compare - _scalar_type_regex = re.compile(r"t_(?P\w+)(?:\((?P\d+)\))?") + scalar_type_regex = re.compile(r"t_(?P\w+)(?:\((?P\d+)\))?") _scalar_type_map = { "rchar": "char", @@ -62,10 +60,11 @@ def __init__(self, compare: IsleCompare): "STD Near": "__stdcall", } - def scalar_type_to_cpp(self, scalar_type: str) -> str: + @classmethod + def scalar_type_to_cpp(cls, scalar_type: str) -> str: if scalar_type.startswith("32p"): - return f"{self.scalar_type_to_cpp(scalar_type[3:])} *" - return self._scalar_type_map.get(scalar_type, scalar_type) + return f"{cls.scalar_type_to_cpp(scalar_type[3:])} *" + return cls._scalar_type_map.get(scalar_type, scalar_type) def lookup_type(self, type_name: Optional[str]) -> Optional[dict[str, Any]]: return ( @@ -74,11 +73,12 @@ def lookup_type(self, type_name: 
Optional[str]) -> Optional[dict[str, Any]]: else self.compare.cv.types.keys.get(type_name.lower()) ) + # TODO: This is mostly legacy code now, we may be able to remove it def type_to_cpp_type_name(self, type_name: str) -> str: # pylint: disable=too-many-return-statements type_lower = type_name.lower() if type_lower.startswith("t_"): - if (match := self._scalar_type_regex.match(type_lower)) is None: + if (match := self.scalar_type_regex.match(type_lower)) is None: raise TypeNotFoundError(f"Type has unexpected format: {type_name}") return self.scalar_type_to_cpp(match.group("typename")) @@ -131,16 +131,12 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: ) return None - return_type = self.type_to_cpp_type_name(function_type["return_type"]) - class_type = self.lookup_type(function_type.get("class_type")) + class_type = function_type.get("class_type") arg_list_type = self.lookup_type(function_type.get("arg_list_type")) assert arg_list_type is not None arg_list_pdb_types = arg_list_type.get("args", []) assert arg_list_type["argcount"] == len(arg_list_pdb_types) - arglist = [ - self.type_to_cpp_type_name(argtype) for argtype in arg_list_pdb_types - ] stack_symbols: list[CppStackOrRegisterSymbol] = [] for symbol in fn.stack_symbols: @@ -157,7 +153,7 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: stack_symbols.append( CppStackSymbol( symbol.name, - self.type_to_cpp_type_name(symbol.data_type), + symbol.data_type, stack_offset, ) ) @@ -166,8 +162,8 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: return FunctionSignature( call_type=call_type, - arglist=arglist, - return_type=return_type, + arglist=arg_list_pdb_types, + return_type=function_type["return_type"], class_type=class_type, stack_symbols=stack_symbols, ) @@ -175,7 +171,7 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: def get_function_list(self) -> list[tuple[MatchInfo, 
FunctionSignature]]: handled = ( self.handle_matched_function(match) - for match in self.compare._db.get_matches_by_type(SymbolType.FUNCTION) + for match in self.compare.db.get_matches_by_type(SymbolType.FUNCTION) ) return [signature for signature in handled if signature is not None] @@ -183,7 +179,7 @@ def handle_matched_function( self, match_info: MatchInfo ) -> Optional[tuple[MatchInfo, FunctionSignature]]: assert match_info.orig_addr is not None - match_options = self.compare._db.get_match_options(match_info.orig_addr) + match_options = self.compare.db.get_match_options(match_info.orig_addr) assert match_options is not None if match_options.get("skip", False) or match_options.get("stub", False): return None diff --git a/tools/ghidra_scripts/lego_util/type_importer.py b/tools/ghidra_scripts/lego_util/type_importer.py new file mode 100644 index 00000000..b86479d1 --- /dev/null +++ b/tools/ghidra_scripts/lego_util/type_importer.py @@ -0,0 +1,260 @@ +from typing import Any + +# Disable spurious warnings in vscode / pylance +# pyright: reportMissingModuleSource=false + +from lego_util.exceptions import ( + ClassOrNamespaceNotFoundInGhidraError, + TypeNotFoundError, + TypeNotFoundInGhidraError, + TypeNotImplementedError, +) +from lego_util.ghidra_helper import ( + add_pointer_type, + create_ghidra_namespace, + get_ghidra_namespace, + get_ghidra_type, + sanitize_class_name, +) +from lego_util.pdb_extraction import PdbExtractionForGhidraMigration +from lego_util.function_importer import logger + + +from ghidra.program.flatapi import FlatProgramAPI +from ghidra.program.model.data import ( + ArrayDataType, + CategoryPath, + DataType, + DataTypeConflictHandler, + StructureDataType, + StructureInternal, +) +from ghidra.util.task import ConsoleTaskMonitor + + +class PdbTypeImporter: + def __init__( + self, api: FlatProgramAPI, extraction: PdbExtractionForGhidraMigration + ): + self.api = api + self.extraction = extraction + self.handled_structs: set[str] = ( + set() + ) # 
tracks the types we have already imported, otherwise we keep overwriting finished work + + @property + def types(self): + return self.extraction.compare.cv.types + + def _import_class_or_struct(self, type_in_pdb: dict[str, Any]) -> DataType: + field_list_type = type_in_pdb.get("field_list_type") + if field_list_type is None: + raise TypeNotFoundError( + f"Found a referenced missing type that is not a class or lacks a field_list_type: {type_in_pdb}" + ) + + field_list = self.types.keys[field_list_type.lower()] + logger.debug("Found class: %s", type_in_pdb) + + class_size: int = type_in_pdb["size"] + class_name_with_namespace: str = sanitize_class_name(type_in_pdb["name"]) + + if class_name_with_namespace in self.handled_structs: + logger.debug( + "Class has been handled or is being handled: %s", + class_name_with_namespace, + ) + return get_ghidra_type(self.api, class_name_with_namespace) + + # Add as soon as we start to avoid infinite recursion + self.handled_structs.add(class_name_with_namespace) + + # Create class / namespace if it does not exist + colon_split = class_name_with_namespace.split("::") + class_name = colon_split[-1] + try: + get_ghidra_namespace(self.api, colon_split) + logger.debug("Found existing class/namespace %s", class_name_with_namespace) + except ClassOrNamespaceNotFoundInGhidraError: + logger.info("Creating class/namespace %s", class_name_with_namespace) + class_name = colon_split.pop() + parent_namespace = create_ghidra_namespace(self.api, colon_split) + self.api.createClass(parent_namespace, class_name) + + # Create type if it does not exist + try: + data_type = get_ghidra_type(self.api, class_name_with_namespace) + logger.debug( + "Found existing data type %s under category path %s", + class_name_with_namespace, + data_type.getCategoryPath(), + ) + except TypeNotFoundInGhidraError: + # Create a new struct data type + data_type = StructureDataType( + CategoryPath("/imported"), class_name_with_namespace, class_size + ) + data_type = ( + 
self.api.getCurrentProgram() + .getDataTypeManager() + .addDataType(data_type, DataTypeConflictHandler.KEEP_HANDLER) + ) + logger.info("Created new data type %s", class_name_with_namespace) + + assert isinstance( + data_type, StructureInternal + ), f"Found type sharing its name with a class/struct, but is not a struct: {class_name_with_namespace}" + + if (old_size := data_type.getLength()) != class_size: + logger.warning( + "Existing class %s had incorrect size %d. Setting to %d...", + class_name_with_namespace, + old_size, + class_size, + ) + # TODO: Implement comparison to expected layout + # We might not need that, but it helps to not break stuff if we run into an error + + logger.info("Adding class data type %s", class_name_with_namespace) + logger.debug("Class information: %s", type_in_pdb) + + data_type.deleteAll() + data_type.growStructure(class_size) + + # this case happened for IUnknown, which linked to an (incorrect) existing library, and some other types as well. + # Unfortunately, we don't get proper error handling for read-only types + if data_type.getLength() != class_size: + logger.warning( + "Failed to modify data type %s. 
Please remove the existing one by hand and try again.", + class_name_with_namespace, + ) + + assert ( + self.api.getCurrentProgram() + .getDataTypeManager() + .remove(data_type, ConsoleTaskMonitor()) + ), f"Failed to delete and re-create data type {class_name_with_namespace}" + data_type = StructureDataType( + CategoryPath("/imported"), class_name_with_namespace, class_size + ) + data_type = ( + self.api.getCurrentProgram() + .getDataTypeManager() + .addDataType(data_type, DataTypeConflictHandler.KEEP_HANDLER) + ) + assert isinstance(data_type, StructureInternal) # for type checking + + # Delete existing components - likely not needed when using replaceAtOffset exhaustively + # for component in data_type.getComponents(): + # data_type.deleteAtOffset(component.getOffset()) + + # can be missing when no new fields are declared + components: list[dict[str, Any]] = field_list.get("members") or [] + + super_type = field_list.get("super") + if super_type is not None: + components.insert(0, {"type": super_type, "offset": 0, "name": "base"}) + + for component in components: + ghidra_type = self.pdb_to_ghidra_type(component["type"]) + logger.debug("Adding component to class: %s", component) + # XXX: temporary exception handling to get better logs + try: + data_type.replaceAtOffset( + component["offset"], ghidra_type, -1, component["name"], None + ) + except Exception as e: + raise Exception(f"Error importing {type_in_pdb}") from e + + logger.info("Finished importing class %s", class_name_with_namespace) + + return data_type + + def pdb_to_ghidra_type(self, type_index: str) -> DataType: + """ + Experimental new type converter to get rid of the intermediate step PDB -> C++ -> Ghidra + + @param type_index Either a scalar type like `T_INT4(...)` or a PDB reference like `0x10ba` + """ + # scalar type + type_index_lower = type_index.lower() + if type_index_lower.startswith("t_"): + if ( + match := self.extraction.scalar_type_regex.match(type_index_lower) + ) is None: + raise 
TypeNotFoundError(f"Type has unexpected format: {type_index}") + + scalar_cpp_type = self.extraction.scalar_type_to_cpp( + match.group("typename") + ) + return get_ghidra_type(self.api, scalar_cpp_type) + + try: + type_pdb = self.extraction.compare.cv.types.keys[type_index_lower] + except KeyError as e: + raise TypeNotFoundError( + f"Failed to find referenced type {type_index_lower}" + ) from e + + type_category = type_pdb["type"] + + if type_category == "LF_POINTER": + return add_pointer_type( + self.api, self.pdb_to_ghidra_type(type_pdb["element_type"]) + ) + + if type_category in ["LF_CLASS", "LF_STRUCTURE"]: + if type_pdb.get("is_forward_ref", False): + logger.debug( + "Following forward reference from %s to %s", + type_index, + type_pdb["udt"], + ) + return self.pdb_to_ghidra_type(type_pdb["udt"]) + + return self._import_class_or_struct(type_pdb) + + if type_category == "LF_ARRAY": + # TODO: See how well this interacts with arrays in functions + # We treat arrays like pointers because we don't distinguish them in Ghidra + logger.debug("Encountered array: %s", type_pdb) + inner_type = self.pdb_to_ghidra_type(type_pdb["array_type"]) + + # TODO: Insert size / consider switching to pointer if not applicable + return ArrayDataType(inner_type, 0, 0) + + if type_category == "LF_ENUM": + logger.warning( + "Replacing enum by underlying type (not implemented yet): %s", type_pdb + ) + return self.pdb_to_ghidra_type(type_pdb["underlying_type"]) + + if type_category == "LF_MODIFIER": + logger.warning("Not sure what a modifier is: %s", type_pdb) + # not sure what this actually is, take what it references + return self.pdb_to_ghidra_type(type_pdb["modifies"]) + + if type_category == "LF_PROCEDURE": + logger.info( + "Function-valued argument or return type will be replaced by void pointer: %s", + type_pdb, + ) + return get_ghidra_type(self.api, "void") + + if type_category == "LF_UNION": + if type_pdb.get("is_forward_ref", False): + return 
self.pdb_to_ghidra_type(type_pdb["udt"]) + + try: + logger.debug("Dereferencing union %s", type_pdb) + union_type = get_ghidra_type(self.api, type_pdb["name"]) + assert ( + union_type.getLength() == type_pdb["size"] + ), f"Wrong size of existing union type '{type_pdb['name']}': expected {type_pdb['size']}, got {union_type.getLength()}" + return union_type + except TypeNotFoundInGhidraError as e: + raise TypeNotImplementedError( + f"Writing union types is not supported. Please add by hand: {type_pdb}" + ) from e + + raise TypeNotImplementedError(type_pdb) diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py index 0c84eb92..a06e623b 100644 --- a/tools/isledecomp/isledecomp/compare/core.py +++ b/tools/isledecomp/isledecomp/compare/core.py @@ -88,6 +88,11 @@ def __init__( self._match_thunks() self._find_vtordisp() + @property + def db(self): + """Newer code needs to access this field, legacy code uses _db""" + return self._db + def _load_cvdump(self): logger.info("Parsing %s ...", self.pdb_file) self.cv = ( @@ -161,7 +166,10 @@ def _load_cvdump(self): addr, sym.node_type, sym.name(), sym.decorated_name, sym.size() ) - for (section, offset), (filename, line_no) in self.cvdump_analysis.verified_lines.items(): + for (section, offset), ( + filename, + line_no, + ) in self.cvdump_analysis.verified_lines.items(): addr = self.recomp_bin.get_abs_addr(section, offset) self._lines_db.add_line(filename, line_no, addr) diff --git a/tools/isledecomp/isledecomp/cvdump/types.py b/tools/isledecomp/isledecomp/cvdump/types.py index 81eee5c2..9776cc39 100644 --- a/tools/isledecomp/isledecomp/cvdump/types.py +++ b/tools/isledecomp/isledecomp/cvdump/types.py @@ -216,6 +216,9 @@ class CvdumpTypesParser: re.compile(r"^\s*enum name = (?P.+)$"), re.compile(r"^\s*UDT\((?P0x\w+)\)$"), ] + LF_UNION_LINE = re.compile( + r".*field list type (?P0x\w+),.*Size = (?P\d+)\s*,class name = (?P(?:[^,]|,\S)+),\s.*UDT\((?P0x\w+)\)" + ) MODES_OF_INTEREST = { 
"LF_ARRAY", @@ -228,6 +231,7 @@ class CvdumpTypesParser: "LF_ARGLIST", "LF_MFUNCTION", "LF_PROCEDURE", + "LF_UNION", } def __init__(self) -> None: @@ -298,7 +302,9 @@ def _mock_array_members(self, type_obj: Dict) -> List[FieldListItem]: raise CvdumpIntegrityError("No array element type") array_element_size = self.get(array_type).size - assert array_element_size is not None, "Encountered an array whose type has no size" + assert ( + array_element_size is not None + ), "Encountered an array whose type has no size" n_elements = type_obj["size"] // array_element_size @@ -399,7 +405,9 @@ def get_scalars_gapless(self, type_key: str) -> List[ScalarType]: obj = self.get(type_key) total_size = obj.size - assert total_size is not None, "Called get_scalar_gapless() on a type without size" + assert ( + total_size is not None + ), "Called get_scalar_gapless() on a type without size" scalars = self.get_scalars(type_key) @@ -506,6 +514,9 @@ def read_line(self, line: str): elif self.mode == "LF_ENUM": self.read_enum_line(line) + elif self.mode == "LF_UNION": + self.read_union_line(line) + else: # Check for exhaustiveness logger.error("Unhandled data in mode: %s", self.mode) @@ -610,3 +621,14 @@ def parse_enum_attribute(self, attribute: str) -> dict[str, Any]: return {"is_forward_ref": True} logger.error("Unknown attribute in enum: %s", attribute) return {} + + def read_union_line(self, line: str): + """This is a rather barebones handler, only parsing the size""" + if (match := self.LF_UNION_LINE.match(line)) is None: + raise AssertionError(f"Unhandled in union: {line}") + self._set("name", match.group("name")) + if match.group("field_type") == "0x0000": + self._set("is_forward_ref", True) + + self._set("size", int(match.group("size"))) + self._set("udt", normalize_type_id(match.group("udt")))