From a8f6e72b978ac664dacfb009212ebae5a0f5038e Mon Sep 17 00:00:00 2001 From: jonschz Date: Sun, 26 May 2024 21:58:19 +0200 Subject: [PATCH] Implement struct imports [skip ci] - This code is still in dire need of refactoring and tests - There are only single-digit issues left, and 2600 functions can be imported - The biggest remaining error is mismatched stacks --- .../import_functions_from_pdb.py | 66 +++-- tools/ghidra_scripts/lego_util/exceptions.py | 15 +- ...{pdb_to_ghidra.py => function_importer.py} | 36 ++- .../ghidra_scripts/lego_util/ghidra_helper.py | 58 +++- .../lego_util/pdb_extraction.py | 36 ++- .../ghidra_scripts/lego_util/type_importer.py | 260 ++++++++++++++++++ tools/isledecomp/isledecomp/compare/core.py | 10 +- tools/isledecomp/isledecomp/cvdump/types.py | 26 +- 8 files changed, 448 insertions(+), 59 deletions(-) rename tools/ghidra_scripts/lego_util/{pdb_to_ghidra.py => function_importer.py} (88%) create mode 100644 tools/ghidra_scripts/lego_util/type_importer.py diff --git a/tools/ghidra_scripts/import_functions_from_pdb.py b/tools/ghidra_scripts/import_functions_from_pdb.py index 3eb9405d..733936d0 100644 --- a/tools/ghidra_scripts/import_functions_from_pdb.py +++ b/tools/ghidra_scripts/import_functions_from_pdb.py @@ -15,6 +15,7 @@ import importlib from dataclasses import dataclass, field +import logging.handlers import sys import logging from pathlib import Path @@ -44,18 +45,25 @@ def reload_module(module: str): def setup_logging(): - logging.basicConfig( - format="%(levelname)-8s %(message)s", - stream=sys.stdout, - level=logging.INFO, - force=True, + logging.root.handlers.clear() + formatter = logging.Formatter("%(levelname)-8s %(message)s") + # formatter = logging.Formatter("%(name)s %(levelname)-8s %(message)s") # use this to identify loggers + stdout_handler = logging.StreamHandler(sys.stdout) + stdout_handler.setFormatter(formatter) + file_handler = logging.FileHandler( + Path(__file__).absolute().parent.joinpath("import.log"), 
mode="w" ) + file_handler.setFormatter(formatter) + logging.root.setLevel(GLOBALS.loglevel) + logging.root.addHandler(stdout_handler) + logging.root.addHandler(file_handler) logger.info("Starting...") @dataclass class Globals: verbose: bool + loglevel: int running_from_ghidra: bool = False make_changes: bool = False prompt_before_changes: bool = True @@ -64,7 +72,11 @@ class Globals: # hard-coded settings that we don't want to prompt in Ghidra every time -GLOBALS = Globals(verbose=False) +GLOBALS = Globals( + verbose=False, + # loglevel=logging.INFO, + loglevel=logging.DEBUG, +) # Disable spurious warnings in vscode / pylance @@ -111,14 +123,19 @@ def add_python_path(path: str): # We need to quote the types here because they might not exist when running without Ghidra def migrate_function_to_ghidra( - api: "FlatProgramAPI", match_info: "MatchInfo", signature: "FunctionSignature" + api: "FlatProgramAPI", + match_info: "MatchInfo", + signature: "FunctionSignature", + type_importer: "PdbTypeImporter", ): hex_original_address = f"{match_info.orig_addr:x}" # Find the Ghidra function at that address ghidra_address = getAddressFactory().getAddress(hex_original_address) - typed_pdb_function = PdbFunctionWithGhidraObjects(api, match_info, signature) + typed_pdb_function = PdbFunctionWithGhidraObjects( + api, match_info, signature, type_importer + ) if not GLOBALS.make_changes: return @@ -170,19 +187,20 @@ def migrate_function_to_ghidra( askChoice("Continue", "Click 'OK' to continue", ["OK"], "OK") -def process_functions(isle_compare: "IsleCompare"): - # try to acquire matched functions - migration = PdbExtractionForGhidraMigration(isle_compare) - func_signatures = migration.get_function_list() +def process_functions(extraction: "PdbExtractionForGhidraMigration"): + func_signatures = extraction.get_function_list() if not GLOBALS.running_from_ghidra: logger.info("Completed the dry run outside Ghidra.") return - fpapi = FlatProgramAPI(currentProgram()) + api = 
FlatProgramAPI(currentProgram()) + # TODO: Implement a "no changes" mode + type_importer = PdbTypeImporter(api, extraction) + for match_info, signature in func_signatures: try: - migrate_function_to_ghidra(fpapi, match_info, signature) + migrate_function_to_ghidra(api, match_info, signature, type_importer) GLOBALS.statistics.successes += 1 except Lego1Exception as e: log_and_track_failure(e) @@ -216,8 +234,11 @@ def main(): pdb_path = build_path.joinpath("LEGO1.pdb") if not GLOBALS.verbose: - logging.getLogger("isledecomp.compare.db").setLevel(logging.CRITICAL) - logging.getLogger("isledecomp.compare.lines").setLevel(logging.CRITICAL) + logging.getLogger("isledecomp.bin").setLevel(logging.WARNING) + logging.getLogger("isledecomp.compare.core").setLevel(logging.WARNING) + logging.getLogger("isledecomp.compare.db").setLevel(logging.WARNING) + logging.getLogger("isledecomp.compare.lines").setLevel(logging.WARNING) + logging.getLogger("isledecomp.cvdump.symbols").setLevel(logging.WARNING) logger.info("Starting comparison") with Bin(str(origfile_path), find_str=True) as origfile, Bin( @@ -227,8 +248,10 @@ def main(): logger.info("Comparison complete.") + # try to acquire matched functions + migration = PdbExtractionForGhidraMigration(isle_compare) try: - process_functions(isle_compare) + process_functions(migration) finally: if GLOBALS.running_from_ghidra: GLOBALS.statistics.log() @@ -265,8 +288,13 @@ def main(): ) if GLOBALS.running_from_ghidra: - reload_module("lego_util.pdb_to_ghidra") - from lego_util.pdb_to_ghidra import PdbFunctionWithGhidraObjects + reload_module("lego_util.ghidra_helper") + + reload_module("lego_util.function_importer") + from lego_util.function_importer import PdbFunctionWithGhidraObjects + + reload_module("lego_util.type_importer") + from lego_util.type_importer import PdbTypeImporter if __name__ == "__main__": main() diff --git a/tools/ghidra_scripts/lego_util/exceptions.py b/tools/ghidra_scripts/lego_util/exceptions.py index 
f120e717..e44c10f2 100644 --- a/tools/ghidra_scripts/lego_util/exceptions.py +++ b/tools/ghidra_scripts/lego_util/exceptions.py @@ -1,5 +1,13 @@ class Lego1Exception(Exception): - pass + """ + Our own base class for exceptions. + Makes it easier to distinguish expected and unexpected errors. + """ + + +class TypeNotFoundError(Lego1Exception): + def __str__(self): + return f"Type not found in PDB: {self.args[0]}" class TypeNotFoundInGhidraError(Lego1Exception): @@ -7,6 +15,11 @@ def __str__(self): return f"Type not found in Ghidra: {self.args[0]}" +class TypeNotImplementedError(Lego1Exception): + def __str__(self): + return f"Import not implemented for type: {self.args[0]}" + + class ClassOrNamespaceNotFoundInGhidraError(Lego1Exception): def __init__(self, namespaceHierachy: list[str]): super().__init__(namespaceHierachy) diff --git a/tools/ghidra_scripts/lego_util/pdb_to_ghidra.py b/tools/ghidra_scripts/lego_util/function_importer.py similarity index 88% rename from tools/ghidra_scripts/lego_util/pdb_to_ghidra.py rename to tools/ghidra_scripts/lego_util/function_importer.py index d312b004..c8f61e41 100644 --- a/tools/ghidra_scripts/lego_util/pdb_to_ghidra.py +++ b/tools/ghidra_scripts/lego_util/function_importer.py @@ -20,9 +20,11 @@ ) from lego_util.ghidra_helper import ( get_ghidra_namespace, - get_ghidra_type, + sanitize_class_name, ) + from lego_util.exceptions import StackOffsetMismatchError +from lego_util.type_importer import PdbTypeImporter logger = logging.getLogger(__name__) @@ -33,26 +35,36 @@ class PdbFunctionWithGhidraObjects: def __init__( self, - fpapi: FlatProgramAPI, + api: FlatProgramAPI, match_info: MatchInfo, signature: FunctionSignature, + type_importer: "PdbTypeImporter", ): - self.api = fpapi + self.api = api self.match_info = match_info self.signature = signature + self.type_importer = type_importer + + if signature.class_type is not None: + # Import the base class so the namespace exists + 
self.type_importer.pdb_to_ghidra_type(signature.class_type) assert match_info.name is not None - colon_split = match_info.name.split("::") + + colon_split = sanitize_class_name(match_info.name).split("::") self.name = colon_split.pop() namespace_hierachy = colon_split - self.namespace = get_ghidra_namespace(fpapi, namespace_hierachy) + self.namespace = get_ghidra_namespace(api, namespace_hierachy) - self.return_type = get_ghidra_type(fpapi, signature.return_type) + self.return_type = type_importer.pdb_to_ghidra_type( + signature.return_type + ) self.arguments = [ ParameterImpl( f"param{index}", - get_ghidra_type(fpapi, type_name), - fpapi.getCurrentProgram(), + # get_ghidra_type(api, type_name), + type_importer.pdb_to_ghidra_type(type_name), + api.getCurrentProgram(), ) for (index, type_name) in enumerate(signature.arglist) ] @@ -200,7 +212,13 @@ def _rename_stack_parameter(self, param: Parameter): f"Could not find a matching symbol at offset {param.getStackOffset()} in {self.get_full_name()}" ) - if param.getDataType() != get_ghidra_type(self.api, match.data_type): + if match.data_type == "T_NOTYPE(0000)": + logger.warning("Skipping stack parameter of type NOTYPE") + return + + if param.getDataType() != self.type_importer.pdb_to_ghidra_type( + match.data_type + ): logger.error( "Type mismatch for parameter: %s in Ghidra, %s in PDB", param, match ) diff --git a/tools/ghidra_scripts/lego_util/ghidra_helper.py b/tools/ghidra_scripts/lego_util/ghidra_helper.py index 20132ea3..39b7c351 100644 --- a/tools/ghidra_scripts/lego_util/ghidra_helper.py +++ b/tools/ghidra_scripts/lego_util/ghidra_helper.py @@ -16,6 +16,8 @@ from ghidra.program.model.data import DataType from ghidra.program.model.symbol import Namespace +logger = logging.getLogger(__name__) + def get_ghidra_type(api: FlatProgramAPI, type_name: str): """ @@ -44,14 +46,21 @@ def get_ghidra_type(api: FlatProgramAPI, type_name: str): raise MultipleTypesFoundInGhidraError(type_name, result) -def 
add_pointer_type(api: FlatProgramAPI, pointee: DataType): - data_type = PointerDataType(pointee) - data_type.setCategoryPath(pointee.getCategoryPath()) - api.getCurrentProgram().getDataTypeManager().addDataType( - data_type, DataTypeConflictHandler.KEEP_HANDLER +def add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType: + new_data_type = PointerDataType(pointee) + new_data_type.setCategoryPath(pointee.getCategoryPath()) + result_data_type = ( + api.getCurrentProgram() + .getDataTypeManager() + .addDataType(new_data_type, DataTypeConflictHandler.KEEP_HANDLER) ) - logging.info("Created new pointer type %s", data_type) - return data_type + if result_data_type is not new_data_type: + logger.debug( + "New pointer replaced by existing one. Fresh pointer: %s (class: %s)", + result_data_type, + result_data_type.__class__, + ) + return result_data_type def get_ghidra_namespace( @@ -63,3 +72,38 @@ def get_ghidra_namespace( if namespace is None: raise ClassOrNamespaceNotFoundInGhidraError(namespace_hierachy) return namespace + + +def create_ghidra_namespace( + api: FlatProgramAPI, namespace_hierachy: list[str] +) -> Namespace: + namespace = api.getCurrentProgram().getGlobalNamespace() + for part in namespace_hierachy: + namespace = api.getNamespace(namespace, part) + if namespace is None: + namespace = api.createNamespace(namespace, part) + return namespace + + +def sanitize_class_name(name: str) -> str: + """ + Takes a full class or function name and replaces characters not accepted by Ghidra. + Applies mostly to templates. 
+ """ + if "<" in name: + new_class_name = ( + "_template_" + + name + .replace("<", "[") + .replace(">", "]") + .replace("*", "#") + .replace(" ", "") + ) + logger.warning( + "Changing possible template class name from '%s' to '%s'", + name, + new_class_name, + ) + return new_class_name + + return name diff --git a/tools/ghidra_scripts/lego_util/pdb_extraction.py b/tools/ghidra_scripts/lego_util/pdb_extraction.py index 7d47c27e..9c884ef4 100644 --- a/tools/ghidra_scripts/lego_util/pdb_extraction.py +++ b/tools/ghidra_scripts/lego_util/pdb_extraction.py @@ -8,13 +8,11 @@ from isledecomp.compare import Compare as IsleCompare from isledecomp.compare.db import MatchInfo +from lego_util.exceptions import TypeNotFoundError + logger = logging.getLogger(__file__) -class TypeNotFoundError(Exception): - pass - - @dataclass class CppStackOrRegisterSymbol: name: str @@ -38,7 +36,7 @@ class FunctionSignature: call_type: str arglist: list[str] return_type: str - class_type: Optional[dict[str, Any]] + class_type: Optional[str] stack_symbols: list[CppStackOrRegisterSymbol] @@ -46,7 +44,7 @@ class PdbExtractionForGhidraMigration: def __init__(self, compare: IsleCompare): self.compare = compare - _scalar_type_regex = re.compile(r"t_(?P\w+)(?:\((?P\d+)\))?") + scalar_type_regex = re.compile(r"t_(?P\w+)(?:\((?P\d+)\))?") _scalar_type_map = { "rchar": "char", @@ -62,10 +60,11 @@ def __init__(self, compare: IsleCompare): "STD Near": "__stdcall", } - def scalar_type_to_cpp(self, scalar_type: str) -> str: + @classmethod + def scalar_type_to_cpp(cls, scalar_type: str) -> str: if scalar_type.startswith("32p"): - return f"{self.scalar_type_to_cpp(scalar_type[3:])} *" - return self._scalar_type_map.get(scalar_type, scalar_type) + return f"{cls.scalar_type_to_cpp(scalar_type[3:])} *" + return cls._scalar_type_map.get(scalar_type, scalar_type) def lookup_type(self, type_name: Optional[str]) -> Optional[dict[str, Any]]: return ( @@ -74,11 +73,12 @@ def lookup_type(self, type_name: 
Optional[str]) -> Optional[dict[str, Any]]: else self.compare.cv.types.keys.get(type_name.lower()) ) + # TODO: This is mostly legacy code now, we may be able to remove it def type_to_cpp_type_name(self, type_name: str) -> str: # pylint: disable=too-many-return-statements type_lower = type_name.lower() if type_lower.startswith("t_"): - if (match := self._scalar_type_regex.match(type_lower)) is None: + if (match := self.scalar_type_regex.match(type_lower)) is None: raise TypeNotFoundError(f"Type has unexpected format: {type_name}") return self.scalar_type_to_cpp(match.group("typename")) @@ -131,16 +131,12 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: ) return None - return_type = self.type_to_cpp_type_name(function_type["return_type"]) - class_type = self.lookup_type(function_type.get("class_type")) + class_type = function_type.get("class_type") arg_list_type = self.lookup_type(function_type.get("arg_list_type")) assert arg_list_type is not None arg_list_pdb_types = arg_list_type.get("args", []) assert arg_list_type["argcount"] == len(arg_list_pdb_types) - arglist = [ - self.type_to_cpp_type_name(argtype) for argtype in arg_list_pdb_types - ] stack_symbols: list[CppStackOrRegisterSymbol] = [] for symbol in fn.stack_symbols: @@ -157,7 +153,7 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: stack_symbols.append( CppStackSymbol( symbol.name, - self.type_to_cpp_type_name(symbol.data_type), + symbol.data_type, stack_offset, ) ) @@ -166,8 +162,8 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: return FunctionSignature( call_type=call_type, - arglist=arglist, - return_type=return_type, + arglist=arg_list_pdb_types, + return_type=function_type["return_type"], class_type=class_type, stack_symbols=stack_symbols, ) @@ -175,7 +171,7 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: def get_function_list(self) -> list[tuple[MatchInfo, 
FunctionSignature]]: handled = ( self.handle_matched_function(match) - for match in self.compare._db.get_matches_by_type(SymbolType.FUNCTION) + for match in self.compare.db.get_matches_by_type(SymbolType.FUNCTION) ) return [signature for signature in handled if signature is not None] @@ -183,7 +179,7 @@ def handle_matched_function( self, match_info: MatchInfo ) -> Optional[tuple[MatchInfo, FunctionSignature]]: assert match_info.orig_addr is not None - match_options = self.compare._db.get_match_options(match_info.orig_addr) + match_options = self.compare.db.get_match_options(match_info.orig_addr) assert match_options is not None if match_options.get("skip", False) or match_options.get("stub", False): return None diff --git a/tools/ghidra_scripts/lego_util/type_importer.py b/tools/ghidra_scripts/lego_util/type_importer.py new file mode 100644 index 00000000..b86479d1 --- /dev/null +++ b/tools/ghidra_scripts/lego_util/type_importer.py @@ -0,0 +1,260 @@ +from typing import Any + +# Disable spurious warnings in vscode / pylance +# pyright: reportMissingModuleSource=false + +from lego_util.exceptions import ( + ClassOrNamespaceNotFoundInGhidraError, + TypeNotFoundError, + TypeNotFoundInGhidraError, + TypeNotImplementedError, +) +from lego_util.ghidra_helper import ( + add_pointer_type, + create_ghidra_namespace, + get_ghidra_namespace, + get_ghidra_type, + sanitize_class_name, +) +from lego_util.pdb_extraction import PdbExtractionForGhidraMigration +from lego_util.function_importer import logger + + +from ghidra.program.flatapi import FlatProgramAPI +from ghidra.program.model.data import ( + ArrayDataType, + CategoryPath, + DataType, + DataTypeConflictHandler, + StructureDataType, + StructureInternal, +) +from ghidra.util.task import ConsoleTaskMonitor + + +class PdbTypeImporter: + def __init__( + self, api: FlatProgramAPI, extraction: PdbExtractionForGhidraMigration + ): + self.api = api + self.extraction = extraction + self.handled_structs: set[str] = ( + set() + ) # 
tracks the types we have already imported, otherwise we keep overwriting finished work + + @property + def types(self): + return self.extraction.compare.cv.types + + def _import_class_or_struct(self, type_in_pdb: dict[str, Any]) -> DataType: + field_list_type = type_in_pdb.get("field_list_type") + if field_list_type is None: + raise TypeNotFoundError( + f"Found a referenced missing type that is not a class or lacks a field_list_type: {type_in_pdb}" + ) + + field_list = self.types.keys[field_list_type.lower()] + logger.debug("Found class: %s", type_in_pdb) + + class_size: int = type_in_pdb["size"] + class_name_with_namespace: str = sanitize_class_name(type_in_pdb["name"]) + + if class_name_with_namespace in self.handled_structs: + logger.debug( + "Class has been handled or is being handled: %s", + class_name_with_namespace, + ) + return get_ghidra_type(self.api, class_name_with_namespace) + + # Add as soon as we start to avoid infinite recursion + self.handled_structs.add(class_name_with_namespace) + + # Create class / namespace if it does not exist + colon_split = class_name_with_namespace.split("::") + class_name = colon_split[-1] + try: + get_ghidra_namespace(self.api, colon_split) + logger.debug("Found existing class/namespace %s", class_name_with_namespace) + except ClassOrNamespaceNotFoundInGhidraError: + logger.info("Creating class/namespace %s", class_name_with_namespace) + class_name = colon_split.pop() + parent_namespace = create_ghidra_namespace(self.api, colon_split) + self.api.createClass(parent_namespace, class_name) + + # Create type if it does not exist + try: + data_type = get_ghidra_type(self.api, class_name_with_namespace) + logger.debug( + "Found existing data type %s under category path %s", + class_name_with_namespace, + data_type.getCategoryPath(), + ) + except TypeNotFoundInGhidraError: + # Create a new struct data type + data_type = StructureDataType( + CategoryPath("/imported"), class_name_with_namespace, class_size + ) + data_type = ( + 
self.api.getCurrentProgram() + .getDataTypeManager() + .addDataType(data_type, DataTypeConflictHandler.KEEP_HANDLER) + ) + logger.info("Created new data type %s", class_name_with_namespace) + + assert isinstance( + data_type, StructureInternal + ), f"Found type sharing its name with a class/struct, but is not a struct: {class_name_with_namespace}" + + if (old_size := data_type.getLength()) != class_size: + logger.warning( + "Existing class %s had incorrect size %d. Setting to %d...", + class_name_with_namespace, + old_size, + class_size, + ) + # TODO: Implement comparison to expected layout + # We might not need that, but it helps to not break stuff if we run into an error + + logger.info("Adding class data type %s", class_name_with_namespace) + logger.debug("Class information: %s", type_in_pdb) + + data_type.deleteAll() + data_type.growStructure(class_size) + + # this case happened for IUnknown, which linked to an (incorrect) existing library, and some other types as well. + # Unfortunately, we don't get proper error handling for read-only types + if data_type.getLength() != class_size: + logger.warning( + "Failed to modify data type %s. 
Please remove the existing one by hand and try again.", + class_name_with_namespace, + ) + + assert ( + self.api.getCurrentProgram() + .getDataTypeManager() + .remove(data_type, ConsoleTaskMonitor()) + ), f"Failed to delete and re-create data type {class_name_with_namespace}" + data_type = StructureDataType( + CategoryPath("/imported"), class_name_with_namespace, class_size + ) + data_type = ( + self.api.getCurrentProgram() + .getDataTypeManager() + .addDataType(data_type, DataTypeConflictHandler.KEEP_HANDLER) + ) + assert isinstance(data_type, StructureInternal) # for type checking + + # Delete existing components - likely not needed when using replaceAtOffset exhaustively + # for component in data_type.getComponents(): + # data_type.deleteAtOffset(component.getOffset()) + + # can be missing when no new fields are declared + components: list[dict[str, Any]] = field_list.get("members") or [] + + super_type = field_list.get("super") + if super_type is not None: + components.insert(0, {"type": super_type, "offset": 0, "name": "base"}) + + for component in components: + ghidra_type = self.pdb_to_ghidra_type(component["type"]) + logger.debug("Adding component to class: %s", component) + # XXX: temporary exception handling to get better logs + try: + data_type.replaceAtOffset( + component["offset"], ghidra_type, -1, component["name"], None + ) + except Exception as e: + raise Exception(f"Error importing {type_in_pdb}") from e + + logger.info("Finished importing class %s", class_name_with_namespace) + + return data_type + + def pdb_to_ghidra_type(self, type_index: str) -> DataType: + """ + Experimental new type converter to get rid of the intermediate step PDB -> C++ -> Ghidra + + @param type_index Either a scalar type like `T_INT4(...)` or a PDB reference like `0x10ba` + """ + # scalar type + type_index_lower = type_index.lower() + if type_index_lower.startswith("t_"): + if ( + match := self.extraction.scalar_type_regex.match(type_index_lower) + ) is None: + raise 
TypeNotFoundError(f"Type has unexpected format: {type_index}") + + scalar_cpp_type = self.extraction.scalar_type_to_cpp( + match.group("typename") + ) + return get_ghidra_type(self.api, scalar_cpp_type) + + try: + type_pdb = self.extraction.compare.cv.types.keys[type_index_lower] + except KeyError as e: + raise TypeNotFoundError( + f"Failed to find referenced type {type_index_lower}" + ) from e + + type_category = type_pdb["type"] + + if type_category == "LF_POINTER": + return add_pointer_type( + self.api, self.pdb_to_ghidra_type(type_pdb["element_type"]) + ) + + if type_category in ["LF_CLASS", "LF_STRUCTURE"]: + if type_pdb.get("is_forward_ref", False): + logger.debug( + "Following forward reference from %s to %s", + type_index, + type_pdb["udt"], + ) + return self.pdb_to_ghidra_type(type_pdb["udt"]) + + return self._import_class_or_struct(type_pdb) + + if type_category == "LF_ARRAY": + # TODO: See how well this interacts with arrays in functions + # We treat arrays like pointers because we don't distinguish them in Ghidra + logger.debug("Encountered array: %s", type_pdb) + inner_type = self.pdb_to_ghidra_type(type_pdb["array_type"]) + + # TODO: Insert size / consider switching to pointer if not applicable + return ArrayDataType(inner_type, 0, 0) + + if type_category == "LF_ENUM": + logger.warning( + "Replacing enum by underlying type (not implemented yet): %s", type_pdb + ) + return self.pdb_to_ghidra_type(type_pdb["underlying_type"]) + + if type_category == "LF_MODIFIER": + logger.warning("Not sure what a modifier is: %s", type_pdb) + # not sure what this actually is, take what it references + return self.pdb_to_ghidra_type(type_pdb["modifies"]) + + if type_category == "LF_PROCEDURE": + logger.info( + "Function-valued argument or return type will be replaced by void pointer: %s", + type_pdb, + ) + return get_ghidra_type(self.api, "void") + + if type_category == "LF_UNION": + if type_pdb.get("is_forward_ref", False): + return 
self.pdb_to_ghidra_type(type_pdb["udt"]) + + try: + logger.debug("Dereferencing union %s", type_pdb) + union_type = get_ghidra_type(self.api, type_pdb["name"]) + assert ( + union_type.getLength() == type_pdb["size"] + ), f"Wrong size of existing union type '{type_pdb['name']}': expected {type_pdb['size']}, got {union_type.getLength()}" + return union_type + except TypeNotFoundInGhidraError as e: + raise TypeNotImplementedError( + f"Writing union types is not supported. Please add by hand: {type_pdb}" + ) from e + + raise TypeNotImplementedError(type_pdb) diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py index 0c84eb92..a06e623b 100644 --- a/tools/isledecomp/isledecomp/compare/core.py +++ b/tools/isledecomp/isledecomp/compare/core.py @@ -88,6 +88,11 @@ def __init__( self._match_thunks() self._find_vtordisp() + @property + def db(self): + """Newer code needs to access this field, legacy code uses _db""" + return self._db + def _load_cvdump(self): logger.info("Parsing %s ...", self.pdb_file) self.cv = ( @@ -161,7 +166,10 @@ def _load_cvdump(self): addr, sym.node_type, sym.name(), sym.decorated_name, sym.size() ) - for (section, offset), (filename, line_no) in self.cvdump_analysis.verified_lines.items(): + for (section, offset), ( + filename, + line_no, + ) in self.cvdump_analysis.verified_lines.items(): addr = self.recomp_bin.get_abs_addr(section, offset) self._lines_db.add_line(filename, line_no, addr) diff --git a/tools/isledecomp/isledecomp/cvdump/types.py b/tools/isledecomp/isledecomp/cvdump/types.py index 81eee5c2..9776cc39 100644 --- a/tools/isledecomp/isledecomp/cvdump/types.py +++ b/tools/isledecomp/isledecomp/cvdump/types.py @@ -216,6 +216,9 @@ class CvdumpTypesParser: re.compile(r"^\s*enum name = (?P.+)$"), re.compile(r"^\s*UDT\((?P0x\w+)\)$"), ] + LF_UNION_LINE = re.compile( + r".*field list type (?P0x\w+),.*Size = (?P\d+)\s*,class name = (?P(?:[^,]|,\S)+),\s.*UDT\((?P0x\w+)\)" + ) MODES_OF_INTEREST = { 
"LF_ARRAY", @@ -228,6 +231,7 @@ class CvdumpTypesParser: "LF_ARGLIST", "LF_MFUNCTION", "LF_PROCEDURE", + "LF_UNION", } def __init__(self) -> None: @@ -298,7 +302,9 @@ def _mock_array_members(self, type_obj: Dict) -> List[FieldListItem]: raise CvdumpIntegrityError("No array element type") array_element_size = self.get(array_type).size - assert array_element_size is not None, "Encountered an array whose type has no size" + assert ( + array_element_size is not None + ), "Encountered an array whose type has no size" n_elements = type_obj["size"] // array_element_size @@ -399,7 +405,9 @@ def get_scalars_gapless(self, type_key: str) -> List[ScalarType]: obj = self.get(type_key) total_size = obj.size - assert total_size is not None, "Called get_scalar_gapless() on a type without size" + assert ( + total_size is not None + ), "Called get_scalar_gapless() on a type without size" scalars = self.get_scalars(type_key) @@ -506,6 +514,9 @@ def read_line(self, line: str): elif self.mode == "LF_ENUM": self.read_enum_line(line) + elif self.mode == "LF_UNION": + self.read_union_line(line) + else: # Check for exhaustiveness logger.error("Unhandled data in mode: %s", self.mode) @@ -610,3 +621,14 @@ def parse_enum_attribute(self, attribute: str) -> dict[str, Any]: return {"is_forward_ref": True} logger.error("Unknown attribute in enum: %s", attribute) return {} + + def read_union_line(self, line: str): + """This is a rather barebones handler, only parsing the size""" + if (match := self.LF_UNION_LINE.match(line)) is None: + raise AssertionError(f"Unhandled in union: {line}") + self._set("name", match.group("name")) + if match.group("field_type") == "0x0000": + self._set("is_forward_ref", True) + + self._set("size", int(match.group("size"))) + self._set("udt", normalize_type_id(match.group("udt")))