Add draft for Ghidra function import script

This commit is contained in:
jonschz 2024-05-11 21:55:36 +02:00
parent 45f9f54f21
commit fd5e8f8d0c
8 changed files with 678 additions and 0 deletions

1
.gitignore vendored
View File

@ -19,3 +19,4 @@ LEGO1.DLL
LEGO1PROGRESS.* LEGO1PROGRESS.*
ISLEPROGRESS.* ISLEPROGRESS.*
*.pyc *.pyc
*$py.class

View File

@ -0,0 +1,12 @@
# Ghidra Scripts
## Setup
- In Ghidra, _Open Window -> Script Manager_.
- Click the _Manage Script Directories_ button on the top right.
- Click the _Add_ button and select this file's parent directory.
- Close the window and click the _Refresh_ button.
- This script should now be available under the folder _LEGO1_.
## Development
- Type hints for Ghidra (optional): Download a recent release from https://github.com/VDOO-Connected-Trust/ghidra-pyi-generator,
unpack it somewhere, and `pip install` that directory in this virtual environment. This provides types and headers for Python.

View File

@ -0,0 +1,300 @@
# Synchronises the function signatures of LEGO1.dll to Ghidra.
# At startup there will be several prompts for different modes,
# including a read-only / dry run mode.
# @author J. Schulz
# @category LEGO1
# @keybinding
# @menupath
# @toolbar
# Disable spurious warnings in vscode / pylance
# pyright: reportMissingModuleSource=false
import sys
import os
import re
import traceback
import logging
from lego_util.cpp_parser import (
CppFunctionDeclaration,
function_regex,
class_regex,
struct_regex,
namespace_regex,
)
from lego_util.file_helper import iterate_dir
from lego_util.exceptions import (
Lego1Exception,
NamespaceNotFoundInGhidraError,
TypeNotFoundInGhidraError,
FunctionNotFoundInGhidraError,
)
# # no effect when no Ghidra is used
# READ_ONLY = False
# # READ_ONLY = True
# Type annotations are only available in Python 3.5 or later
if sys.version_info.major > 2:
    from typing import TYPE_CHECKING, TypeVar

    # Everything below is guarded by TYPE_CHECKING, so it is only meant for
    # static type checkers / IDEs. The stubs mirror names this script uses
    # without importing them (`state`, `getFunctionAt`, `askYesNo`, ...) and
    # exist purely to give those names a signature; their bodies return nothing.
    if TYPE_CHECKING:
        from ghidra.program.model.address import Address, AddressFactory
        from ghidra.program.model.listing import Program
        from ghidra.program.model.data import DataType
        from ghidra.program.model.symbol import Namespace
        from ghidra.app.script import GhidraScript
        from ghidra.app.script import GhidraState

        # Global stubs, Python 2 and 3 compatible

        def _get_state():  # type: () -> GhidraState
            return None  # type: ignore

        # Stub for the `state` object used further down in this script
        state = _get_state()

        def getDataTypes(name):  # type: (str) -> list[DataType]
            return  # type: ignore

        def getCurrentProgram():  # type: () -> Program
            return  # type: ignore

        def getFunctionAt(entryPoint):  # type: (Address) -> Function
            return  # type: ignore

        def getAddressFactory():  # type: () -> AddressFactory
            return  # type: ignore

        def getNamespace(parent, namespaceName):  # type: (Namespace, str) -> Namespace
            return  # type: ignore

        def askYesNo(title, message):  # type: (str, str) -> bool
            return  # type: ignore

        # Generic type of the entries offered by `askChoice`
        T = TypeVar("T")

        def askChoice(
            title, message, choices, defaultValue
        ):  # type: (str, str, list[T], T) -> T
            return  # type: ignore
# This script can be run both from Ghidra and as a standalone.
# In the latter case, only the C++ parser can be used.
try:
    from ghidra.program.model.listing import Function
    from ghidra.program.flatapi import FlatProgramAPI
    from lego_util.ghidra_helper import CppFunctionWithGhidraTypes

    # This is needed for Ghidra API calls in submodules
    API = FlatProgramAPI(state.getCurrentProgram())

    # First prompt: dry run or real run?
    MAKE_CHANGES = askYesNo(
        "Make changes?", "Select 'Yes' to apply changes, select 'No' to do a dry run."
    )
    # Second prompt only when changes are going to be made. In a dry run the
    # value is never acted upon; it is set to True only to keep linters quiet.
    PROMPT_BEFORE_CHANGE = (
        askYesNo(
            "Prompt before changes?", "Should each change be confirmed by a prompt?"
        )
        if MAKE_CHANGES
        else True
    )
    RUNNING_FROM_GHIDRA = True
except ImportError:
    # The Ghidra modules could not be imported: restrict the run to the parser
    RUNNING_FROM_GHIDRA = False
    MAKE_CHANGES = False
# Names captured by `class_regex` / `struct_regex`, filled by `search_for_classes_and_structs`
CLASSES_AND_STRUCTS = set() # type: set[str]
# Names captured by `namespace_regex`, filled by `search_for_classes_and_structs`
NAMESPACES = set() # type: set[str]
# Number of annotated functions for which `handle_function` returned without raising
SUCCESSES = 0
# Exception class name -> number of times `log_and_track_failure` has seen it
FAILURES = {} # type: dict[str, int]
# Missing type name -> number of `TypeNotFoundInGhidraError`s reported for it
KNOWN_MISSING_TYPES = {} # type: dict[str, int]
# Namespace strings for which a `NamespaceNotFoundInGhidraError` was already logged
KNOWN_MISSING_NAMESPACES = set() # type: set[str]
# Number of functions overwritten by `handle_function`
FUNCTIONS_CHANGED = 0
def main():
    """
    Entry point: scan the LEGO1 source tree and process every annotated function.

    The tree is walked twice: first to collect class, struct and namespace
    names, then to parse and handle the annotated functions. The statistics
    are logged in a `finally` clause, so they are also printed when the run
    is aborted by an exception or by the user.
    """
    logging.basicConfig(
        level=logging.INFO, stream=sys.stdout, format="%(levelname)-8s %(message)s"
    )

    if not RUNNING_FROM_GHIDRA:
        logging.error(
            "Failed to import Ghidra functions, doing a dry run for the source code parser. "
            "Has this script been launched from Ghidra?"
        )

    # navigate to this repository's root and then down to the LEGO1 source
    script_dir = os.path.dirname(__file__)
    root_dir = os.path.join(script_dir, "..", "..", "LEGO1")

    try:
        # Classes and structs must be known before the functions are parsed
        for per_file_callback in (
            search_for_classes_and_structs,
            search_and_process_functions,
        ):
            iterate_dir(root_dir, per_file_callback)
    finally:
        # output statistics even when aborting; most frequent missing types first
        types_by_count = sorted(
            KNOWN_MISSING_TYPES.items(), key=lambda x: x[1], reverse=True
        )
        missing_types_summary = ", ".join(
            "%s (%d)" % (type_name, count) for type_name, count in types_by_count
        )
        logging.info(
            "Missing types: (with number of occurences): %s", missing_types_summary
        )
        logging.info("Successes: %d", SUCCESSES)
        logging.info("Failures: %s", FAILURES)
        logging.info("Functions changed: %d", FUNCTIONS_CHANGED)
def log_and_track_failure(
    file_path, error, unexpected=False
):  # type: (str, Exception, bool) -> None
    """
    Count a failure by exception class and log it, suppressing repeats.

    Args:
        file_path: Source file being processed; only its base name is logged.
        error: The exception that was raised.
        unexpected: If True, the log line is prefixed with "Unexpected error in ".

    A missing type or missing namespace is logged only the first time it is
    seen; later occurrences are still counted but produce no log line.
    """
    failure_kind = error.__class__.__name__
    FAILURES[failure_kind] = FAILURES.get(failure_kind, 0) + 1

    if isinstance(error, TypeNotFoundInGhidraError):
        type_name = error.args[0]
        previously_seen = KNOWN_MISSING_TYPES.get(type_name, 0)
        KNOWN_MISSING_TYPES[type_name] = previously_seen + 1
        if previously_seen:
            # Log each missing type only once to reduce log noise
            return

    if isinstance(error, NamespaceNotFoundInGhidraError):
        namespace_str = error.get_namespace_str()
        if namespace_str in KNOWN_MISSING_NAMESPACES:
            # Log each missing namespace only once to reduce log noise
            return
        KNOWN_MISSING_NAMESPACES.add(namespace_str)

    prefix = "Unexpected error in " if unexpected else ""
    logging.error("%s%s: %s", prefix, os.path.basename(file_path), error)
def handle_function(lines, startIndex, address):  # type: (str, int, str) -> None
    """
    Parse one annotated C++ function and, if required, update it in Ghidra.

    Args:
        lines: Full text of the C++ source file.
        startIndex: Index into `lines` where the text following the annotation starts.
        address: Address string taken from the annotation (without the `0x` prefix).

    Raises:
        FunctionNotFoundInGhidraError: Ghidra has no function at `address`.
        SystemExit: The user picked "Abort" in the confirmation prompt.
        Any exception raised by the parser or the Ghidra helpers is passed on.
    """
    global FUNCTIONS_CHANGED

    # Step over comment lines between the annotation and the declaration
    while re.match(r"\s*//", lines[startIndex:]):
        startIndex = lines.find("\n", startIndex + 1)

    # Parse the C++ function
    declaration = CppFunctionDeclaration(lines, startIndex, CLASSES_AND_STRUCTS)

    if declaration.return_type in CLASSES_AND_STRUCTS:
        # edge case handling - Ghidra does not understand what happens under the hood.
        # These must be set manually
        logging.error(
            "Unimplemented edge case at 0x%s: Return value is a non-referenced struct or class: %s",
            address,
            declaration,
        )
        return

    # Standalone runs stop after exercising the parser
    if not RUNNING_FROM_GHIDRA:
        return

    # Find the Ghidra function at that address
    target_address = getAddressFactory().getAddress(address)
    existing_function = getFunctionAt(target_address)
    if existing_function is None:
        raise FunctionNotFoundInGhidraError(address)

    # Convert the C++ data types to Ghidra data types
    typed_declaration = CppFunctionWithGhidraTypes(API, declaration)

    if typed_declaration.matches_ghidra_function(existing_function):
        logging.debug(
            "Skipping function '%s', matches already", declaration.full_name()
        )
        return

    # Dry run: nothing below may be executed
    if not MAKE_CHANGES:
        return

    # Navigate Ghidra to the current function
    state.setCurrentAddress(target_address)

    if PROMPT_BEFORE_CHANGE:
        answer = askChoice(
            "Change function?",
            "Change to %s" % declaration,
            ["Yes", "No", "Abort"],
            "Yes",
        )
        if answer != "Yes":
            if answer == "No":
                return
            # Anything other than "Yes" / "No" terminates the whole run
            logging.critical("User quit, terminating")
            raise SystemExit(1)

    logging.info("Modifying function %s at 0x%s", declaration.full_name(), address)
    typed_declaration.overwrite_ghidra_function(existing_function)
    FUNCTIONS_CHANGED += 1

    if PROMPT_BEFORE_CHANGE:
        # Add a prompt so we can verify the result immediately
        askChoice("", "Click 'OK' to continue", ["OK"], "OK")
def search_for_classes_and_structs(header_file):  # type: (str) -> None
    """
    Add the class, struct and namespace names found in one file to the globals.

    Only files ending in `.h` or `.cpp` are read. A file that cannot be read
    is logged together with the traceback and then skipped.
    """
    global CLASSES_AND_STRUCTS, NAMESPACES

    if not header_file.endswith((".h", ".cpp")):
        return

    try:
        with open(header_file) as infile:
            headers = infile.read()
    except Exception:
        logging.error(
            "Error handling header file: %s\n%s", header_file, traceback.format_exc()
        )
        return

    CLASSES_AND_STRUCTS = CLASSES_AND_STRUCTS.union(
        class_regex.findall(headers), struct_regex.findall(headers)
    )
    NAMESPACES = NAMESPACES.union(namespace_regex.findall(headers))
def search_and_process_functions(path):  # type: (str) -> None
    """
    Handle every `// FUNCTION: LEGO1 0x...` annotation in one `.cpp` file.

    Files with a different extension are ignored. Each annotated function is
    passed to `handle_function`; a function that is handled without raising
    counts as a success, every exception is recorded through
    `log_and_track_failure` and processing continues with the next annotation.
    `SystemExit` is not an `Exception` and therefore still aborts the run.
    """
    global SUCCESSES
    if not path.endswith(".cpp"):
        return

    with open(path, "r") as file:
        lines = file.read()

    # search for '// FUNCTION: LEGO1 0x[...]'
    for match in function_regex.finditer(lines):
        address = match.groups()[0]
        annotation_line_end = lines.find("\n", match.end())
        if annotation_line_end < 0:
            # `find` returned -1: the annotation is the last line of the file.
            # Adding 1 would yield index 0 and restart parsing at the top of
            # the file, attributing an unrelated declaration to this address.
            log_and_track_failure(
                path,
                Lego1Exception(
                    "No function declaration follows the annotation for 0x%s"
                    % address
                ),
            )
            continue

        try:
            handle_function(lines, annotation_line_end + 1, address)
            SUCCESSES += 1
        except Lego1Exception as e:
            log_and_track_failure(path, e)
        except Exception as e:
            log_and_track_failure(path, e, unexpected=True)
            logging.error(traceback.format_exc())
# Only start the import when this module's __name__ is "__main__"
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,140 @@
import re
from lego_util.exceptions import (
UnsupportedCppSyntaxError,
CppUnknownClassOrNamespaceError,
)
# Matches an annotation comment `// FUNCTION: LEGO1 0x` followed by eight word
# characters; group 1 captures those eight characters (the address).
function_regex = re.compile(r"\s*// FUNCTION: LEGO1 0x(\w{8})")
# The next three patterns require a preceding newline, allow leading whitespace,
# and capture the identifier that follows the `class` / `struct` / `namespace` keyword.
class_regex = re.compile(r"\n\s*class\s(\w+)")
struct_regex = re.compile(r"\n\s*struct\s(\w+)")
namespace_regex = re.compile(r"\n\s*namespace\s(\w+)")
class CppFunctionDeclaration:
    """
    A rudimentary parser for C++ function signatures in LEGO1.
    Assumes that the C++ code has been formatted to some degree.

    Attributes set by the constructor:
        name: Function name without any namespace or class qualifier.
        namespace_hierachy: List of the `::`-separated qualifiers, outermost first.
        return_type: Return type as written in the source; `void` for
            destructors and `<name>*` for constructors.
        class_name: Innermost qualifier if the function is qualified, else None.
        flags: Tokens left over in front of the function name after the
            return type has been removed.
        arguments: List of `(type, name)` tuples.
    """

    def __init__(
        self, fn, start_index, classes_and_structs
    ):  # type: (CppFunctionDeclaration, str, int, set[str]) -> None
        """
        Parse the declaration that starts at `fn[start_index]`.

        Args:
            fn: Source text containing the declaration.
            start_index: Index in `fn` where the declaration begins.
            classes_and_structs: Known class and struct names.

        Raises:
            UnsupportedCppSyntaxError: The declaration has neither a return
                type nor a qualifier, or an argument cannot be parsed.
            CppUnknownClassOrNamespaceError: The innermost qualifier is not
                contained in `classes_and_structs`.
        """
        first_part_str, second_part = self._split_off_declaration_and_arguments(
            fn[start_index:]
        )

        try:
            first_part = first_part_str.split(" ")
            full_function_name = first_part.pop()
            colon_split = full_function_name.split("::")
            self.name = colon_split.pop()
            self.namespace_hierachy = colon_split

            if first_part:
                while True:
                    # desired failure if we only get keywords and no return type
                    self.return_type = first_part.pop(0)
                    if self.return_type not in ["const", "inline"]:
                        break
            else:
                # most likely a constructor or destructor
                if not self.namespace_hierachy:
                    # `namespace_hierachy` is always a list, so it has to be
                    # tested for emptiness; a comparison with None never fails.
                    raise UnsupportedCppSyntaxError(
                        "Unhandled function without return type or namespace"
                    )
                if self.name.startswith("~"):
                    self.return_type = "void"
                else:
                    self.return_type = self.name + "*"

            # evaluate if we belong to a class, assume __thiscall
            self.class_name = None
            if self.namespace_hierachy:
                bottom_level_namespace = self.namespace_hierachy[-1]
                if bottom_level_namespace in classes_and_structs:
                    self.class_name = bottom_level_namespace
                else:
                    raise CppUnknownClassOrNamespaceError(bottom_level_namespace)

            # don't add a `this` argument, let Ghidra handle that
            self.flags = first_part
            if second_part.strip():
                self.arguments = [
                    self._parse_argument(i, x)
                    for i, x in enumerate(second_part.split(","))
                ]
            else:
                self.arguments = []

        except UnsupportedCppSyntaxError as e:
            # Re-raise with the offending declaration appended for context
            raise UnsupportedCppSyntaxError(
                "%s. In: '%s(%s)'" % (e.args[0], first_part_str, second_part)
            )

    def __str__(self):
        """Render the declaration; member functions get `__thiscall` and a `this` argument."""
        flags = " ".join(self.flags)
        if flags:
            # keep the flags separated from the function name
            flags += " "
        full_name = self.full_name()
        args = ["%s %s" % pair for pair in self.arguments]
        if self.class_name:
            # add the "this" argument to the output
            args = [("%s* this" % self.class_name)] + args
            return "%s __thiscall %s%s(%s)" % (
                self.return_type,
                flags,
                full_name,
                ", ".join(args),
            )
        return "%s %s%s(%s)" % (self.return_type, flags, full_name, ", ".join(args))

    def full_name(self):
        """Return the function name including all qualifiers, joined by `::`."""
        return "::".join(self.namespace_hierachy + [self.name])

    def _parse_argument(
        self, index, argument_str
    ):  # type: (int, str) -> tuple[str, str]
        """
        Split one comma-separated argument into its type and name.

        `const` tokens are dropped (case-insensitively). An argument that
        consists of a single token is treated as unnamed and is given the
        name `param<index + 1>`.

        Returns: (type, name)

        Raises:
            UnsupportedCppSyntaxError: The argument is empty or has more than
                two tokens after `const` has been removed.
        """
        # Cleanup, handle `const`
        split = (x.strip() for x in argument_str.split(" "))
        filtered = [x for x in split if len(x) > 0 and x.lower() != "const"]

        if len(filtered) == 0:
            raise UnsupportedCppSyntaxError(
                "Expected more arguments: '%s'" % argument_str.strip()
            )
        if len(filtered) == 1:
            # unnamed argument
            return (filtered[0], "param%d" % (index + 1))
        if len(filtered) == 2:
            return (filtered[0], filtered[1])

        raise UnsupportedCppSyntaxError(
            "Unsupported argument syntax: '%s'" % argument_str.strip()
        )

    def _split_off_declaration_and_arguments(
        self, fn
    ):  # type: (str) -> tuple[str, str]
        """
        Split `fn` at the first parenthesised group.

        Returns:
            The text in front of the first `(` and the text between that `(`
            and its matching `)`, both with newlines removed.
        """
        # handle `unsigned` in arguments and result
        # (e.g. `unsigned int` becomes the single token `uint`)
        fn = fn.replace("unsigned ", "u")
        first_paren = fn.find("(")
        assert first_paren >= 0, "No opening parenthesis found in function '%s'" % fn

        paren_stack = 1
        close_paren = first_paren
        while paren_stack > 0:
            # In case of unmatched parentheses we run into an IndexError,
            # which is expected behaviour
            close_paren += 1
            if fn[close_paren] == "(":
                paren_stack += 1
            elif fn[close_paren] == ")":
                paren_stack -= 1

        return (
            fn[:first_paren].replace("\n", ""),
            fn[first_paren + 1 : close_paren].replace("\n", ""),
        )

View File

@ -0,0 +1,38 @@
class Lego1Exception(Exception):
    """Common base class of the exceptions defined in this module."""
class TypeNotFoundInGhidraError(Lego1Exception):
    """Error for a missing type; the first argument is the type name."""

    def __str__(self):
        type_name = self.args[0]
        return "Type not found in Ghidra: %s" % type_name
class NamespaceNotFoundInGhidraError(Lego1Exception):
    """Error for a missing class or namespace; carries the namespace hierarchy as a list."""

    def __init__(self, namespaceHierachy):  # type: (list[str]) -> None
        # The list is stored as the only entry of `args`
        super(NamespaceNotFoundInGhidraError, self).__init__(namespaceHierachy)

    def get_namespace_str(self):  # type: () -> str
        """Return the hierarchy joined by `::`."""
        hierarchy = self.args[0]
        return "::".join(hierarchy)

    def __str__(self):
        namespace_str = self.get_namespace_str()
        return "Class or namespace not found in Ghidra: %s" % namespace_str
class FunctionNotFoundInGhidraError(Lego1Exception):
    """Error for a missing function; the first argument is the address."""

    def __str__(self):
        address = self.args[0]
        return "Function not found in Ghidra at %s" % address
class MultipleTypesFoundInGhidraError(Lego1Exception):
    """Error for an ambiguous type name; expects two arguments: the name and the matches."""

    def __str__(self):
        # `args` as a whole is the format tuple, so it must hold exactly two entries
        template = "Found multiple types matching '%s' in Ghidra: %s"
        return template % self.args
class UnsupportedCppSyntaxError(Lego1Exception):
    """Error for C++ code the parser cannot handle; the first argument is the detail text."""

    def __str__(self):
        detail = self.args[0]
        return "C++ syntax currently not supported in the parser: %s" % detail
class CppUnknownClassOrNamespaceError(Lego1Exception):
    """Error for an unknown qualifier; the first argument is the qualifier name."""

    def __str__(self):
        qualifier = self.args[0]
        return "'%s' is neither a known class nor namespace" % qualifier

View File

@ -0,0 +1,14 @@
import os
import sys
if sys.version_info.major > 2:
from typing import Callable
def iterate_dir(path, file_callback):  # type: (str, Callable[[str], None]) -> None
    """
    Call `file_callback` with the path of every non-directory entry below `path`.

    Directories are descended into recursively, in the order in which
    `os.listdir` returns the entries.
    """
    for entry_name in os.listdir(path):  # pathlib not supported
        entry_path = os.path.join(path, entry_name)
        if not os.path.isdir(entry_path):
            file_callback(entry_path)
            continue
        iterate_dir(entry_path, file_callback)

View File

@ -0,0 +1,173 @@
import logging
import sys
import re
from lego_util.exceptions import (
NamespaceNotFoundInGhidraError,
TypeNotFoundInGhidraError,
MultipleTypesFoundInGhidraError,
)
from lego_util.cpp_parser import CppFunctionDeclaration
# Disable spurious warnings in vscode / pylance
# pyright: reportMissingModuleSource=false
from ghidra.program.model.data import PointerDataType
from ghidra.program.model.data import DataTypeConflictHandler
from ghidra.program.model.listing import ParameterImpl
from ghidra.program.model.listing import Function
from ghidra.program.model.symbol import SourceType
# Type annotations are only available in Python 3.5 or later
if sys.version_info.major > 2:
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ghidra.program.flatapi import FlatProgramAPI
from ghidra.program.model.data import DataType
from ghidra.program.model.symbol import Namespace
from ghidra.program.model.listing import Parameter
def get_ghidra_type(api, type_name):  # type: (FlatProgramAPI, str) -> DataType
    """
    Searches for the type named `type_name` in Ghidra.

    References are looked up as pointers and a space is inserted in front of
    every `*` that is not already preceded by whitespace. If a pointer type
    is not found but its pointee is, the pointer type is created.

    Raises:
        TypeNotFoundInGhidraError: No matching type exists.
        MultipleTypesFoundInGhidraError: More than one type matches.
    """
    # references to pointers, then reference spacing (void* -> void *)
    normalized_name = re.sub(r"(?<!\s)\*", " *", type_name.replace("&", " *"))

    candidates = api.getDataTypes(normalized_name)
    candidate_count = len(candidates)

    if candidate_count == 1:
        return candidates[0]
    if candidate_count > 1:
        raise MultipleTypesFoundInGhidraError(normalized_name, candidates)

    if not normalized_name.endswith("*"):
        raise TypeNotFoundInGhidraError(normalized_name)

    # Create a new pointer type if the dereferenced type exists;
    # the slice drops the trailing " *"
    pointee_type = get_ghidra_type(api, normalized_name[0:-2])
    return add_pointer_type(api, pointee_type)
def add_pointer_type(api, pointee):  # type: (FlatProgramAPI, DataType) -> DataType
    """
    Create a pointer type to `pointee` in the current program's data type manager.

    The new pointer is placed in the same category as `pointee`.

    Returns:
        The data type returned by the data type manager. With `KEEP_HANDLER`
        this is expected to be the manager's own instance, which may be an
        already existing equivalent type rather than the object built here.
    """
    new_pointer = PointerDataType(pointee)
    new_pointer.setCategoryPath(pointee.categoryPath)
    # Return what the manager hands back instead of the local, unresolved object
    resolved_pointer = (
        api.getCurrentProgram()
        .getDataTypeManager()
        .addDataType(new_pointer, DataTypeConflictHandler.KEEP_HANDLER)
    )
    logging.info("Created new pointer type %s", resolved_pointer)
    return resolved_pointer
def get_ghidra_namespace(
    api, namespace_hierachy
):  # type: (FlatProgramAPI, list[str]) -> Namespace
    """
    Resolve a list of namespace names, outermost first, to a Ghidra namespace.

    The lookup starts at the global namespace of the current program; an
    empty list therefore yields the global namespace itself.

    Raises:
        NamespaceNotFoundInGhidraError: One level of the hierarchy does not exist.
    """
    current = api.getCurrentProgram().getGlobalNamespace()
    for level_name in namespace_hierachy:
        child = api.getNamespace(current, level_name)
        if child is None:
            raise NamespaceNotFoundInGhidraError(namespace_hierachy)
        current = child
    return current
class CppFunctionWithGhidraTypes(object):
    """Collects the matching Ghidra entities for a C++ function declaration."""

    def __init__(
        self, fpapi, cpp_fn_decl
    ):  # type: (FlatProgramAPI, CppFunctionDeclaration) -> None
        """
        Look up the Ghidra return type, parameter types and namespace.

        Exceptions raised by `get_ghidra_type` and `get_ghidra_namespace`
        are passed on to the caller.
        """
        self.name = cpp_fn_decl.name
        self.class_name = cpp_fn_decl.class_name
        self.return_type = get_ghidra_type(fpapi, cpp_fn_decl.return_type)

        parameters = []
        for arg_type_name, arg_name in cpp_fn_decl.arguments:
            arg_type = get_ghidra_type(fpapi, arg_type_name)
            parameters.append(
                ParameterImpl(arg_name, arg_type, fpapi.getCurrentProgram())
            )
        self.arguments = parameters

        self.namespace = get_ghidra_namespace(fpapi, cpp_fn_decl.namespace_hierachy)

    def matches_ghidra_function(self, ghidra_function):  # type: (Function) -> bool
        """Checks whether this function declaration already matches the description in Ghidra"""
        name_match = self.name == ghidra_function.getName(False)
        namespace_match = self.namespace == ghidra_function.getParentNamespace()
        return_type_match = self.return_type == ghidra_function.getReturnType()

        # match arguments: decide if thiscall or not
        is_member = self.class_name is not None
        ghidra_is_thiscall = ghidra_function.getCallingConventionName() == "__thiscall"
        thiscall_matches = is_member == ghidra_is_thiscall

        if not thiscall_matches:
            # the parameter lists are not comparable if the conventions differ
            args_match = False
        elif is_member:
            args_match = self._matches_thiscall_parameters(ghidra_function)
        else:
            args_match = self._matches_non_thiscall_parameters(ghidra_function)

        logging.debug(
            "Matches: namespace=%s name=%s return_type=%s thiscall=%s args=%s",
            namespace_match,
            name_match,
            return_type_match,
            thiscall_matches,
            args_match,
        )

        return (
            name_match
            and namespace_match
            and return_type_match
            and thiscall_matches
            and args_match
        )

    def _matches_non_thiscall_parameters(
        self, ghidra_function
    ):  # type: (Function) -> bool
        """Compare our arguments with all parameters of the Ghidra function."""
        all_params = ghidra_function.getParameters()
        return self._parameter_lists_match(all_params)

    def _matches_thiscall_parameters(self, ghidra_function):  # type: (Function) -> bool
        """Compare our arguments with the Ghidra parameters after the first one."""
        explicit_params = ghidra_function.getParameters()  # type: list[Parameter]
        # remove the `this` argument which we don't generate ourselves
        explicit_params.pop(0)
        return self._parameter_lists_match(explicit_params)

    def _parameter_lists_match(self, ghidra_params):  # type: (list[Parameter]) -> bool
        """Return True if both lists have equal length and agree pairwise in name and data type."""
        if len(self.arguments) != len(ghidra_params):
            return False
        return all(
            ours.getName() == theirs.getName()
            and ours.getDataType() == theirs.getDataType()
            for ours, theirs in zip(self.arguments, ghidra_params)
        )

    def overwrite_ghidra_function(self, ghidra_function):  # type: (Function) -> None
        """Replace the function declaration in Ghidra by the one derived from C++."""
        user_defined = SourceType.USER_DEFINED

        ghidra_function.setName(self.name, user_defined)
        ghidra_function.setParentNamespace(self.namespace)
        ghidra_function.setReturnType(self.return_type, user_defined)

        # not sure what calling convention to choose when it's not a __thiscall,
        # so we play it safe and keep whatever Ghidra has
        if self.class_name:
            ghidra_function.setCallingConvention("__thiscall")

        ghidra_function.replaceParameters(
            Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
            True,
            user_defined,
            self.arguments,
        )