From 769750a5f3b591ab47f26a088bd0266c256f0a49 Mon Sep 17 00:00:00 2001
From: disinvite
Date: Mon, 29 Jan 2024 23:48:37 -0500
Subject: [PATCH] Sort by avg address with outliers removed

---
Review notes (ignored by git-am; not part of the commit message):
Line breaks and indentation were restored; the patch had been collapsed
onto three physical lines. iter_sorted() is annotated Tuple[int, str]
because it yields (average address, module name). The final print() keeps
or_blank() because avg_displacement is still None for a module that has
no recorded displacements, and None rejects the ":10" format spec.
The index line below still carries the original blob ids.

 tools/roadmap/roadmap.py | 55 ++++++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/tools/roadmap/roadmap.py b/tools/roadmap/roadmap.py
index 1d91c91a..de1b2c10 100644
--- a/tools/roadmap/roadmap.py
+++ b/tools/roadmap/roadmap.py
@@ -5,7 +5,8 @@
 import os
 import argparse
 import logging
-from typing import List, Optional
+import statistics
+from typing import Iterator, List, Optional, Tuple
 from collections import namedtuple
 from isledecomp import Bin as IsleBin
 from isledecomp.cvdump import Cvdump
@@ -90,6 +91,19 @@ def truncate_module_name(prefix: str, module: str) -> str:
     return module
 
 
+def avg_remove_outliers(entries: List[int]) -> int:
+    """Return the integer mean of the entries (addresses) that lie within
+    two population standard deviations of the overall mean."""
+
+    if len(entries) == 1:
+        return entries[0]
+
+    avg = statistics.mean(entries)
+    sd = statistics.pstdev(entries)
+
+    return int(statistics.mean([e for e in entries if abs(e - avg) <= 2 * sd]))
+
+
 RoadmapRow = namedtuple(
     "RoadmapRow",
     [
@@ -110,10 +124,17 @@ class DeltaCollector:
     """Reads each row of the results and aggregates information about the
     placement of each module."""
 
-    def __init__(self, match_type: str = "fun"):
+    def __init__(self, match_type: str = "fun") -> None:
+        # The displacement for each symbol from each module
         self.disp_map = {}
+
+        # Each address for each module
+        self.addresses = {}
+
+        # The earliest address for each module
         self.earliest = {}
-        self.seen = set()
+
+        # String abbreviation for which symbol type we are checking
         self.match_type = "fun"
 
         match_type = str(match_type).strip().lower()[:3]
@@ -124,11 +145,15 @@ def read_row(self, row: RoadmapRow):
         if row.module is None:
             return
 
-        self.seen.add(row.module)
         if row.sym_type != self.match_type:
             return
 
         if row.orig_addr is not None:
+            if row.module not in self.addresses:
+                self.addresses[row.module] = []
+
+            self.addresses[row.module].append(row.orig_addr)
+
             if row.orig_addr < self.earliest.get(row.module, 0xFFFFFFFFF):
                 self.earliest[row.module] = row.orig_addr
 
@@ -138,6 +163,15 @@ def read_row(self, row: RoadmapRow):
 
             self.disp_map[row.module].append(row.displacement)
 
+    def iter_sorted(self) -> Iterator[Tuple[int, str]]:
+        """Yield (average address, module name) pairs in ascending order of
+        each module's outlier-trimmed average address."""
+        avg_address = {
+            mod: avg_remove_outliers(values) for mod, values in self.addresses.items()
+        }
+        for mod, avg in sorted(avg_address.items(), key=lambda x: x[1]):
+            yield (avg, mod)
+
 
 def suggest_order(results: List[RoadmapRow], cmake_modules: List[str], match_type: str):
     """Suggest the order of modules for CMakeLists.txt"""
@@ -156,14 +190,12 @@ def suggest_order(results: List[RoadmapRow], cmake_modules: List[str], match_typ
         set("/".join(mod.split("/", 2)[:2]) + "/" for mod in leftover_modules)
     )
 
-    # These may already be sorted by earliest, but make sure
-    first_function = [(earliest, module) for (module, earliest) in dc.earliest.items()]
-    first_function.sort()
+    computed_order = list(dc.iter_sorted())
 
     for prefix in cmake_prefixes:
         print(prefix)
-        # Show modules ordered by the first appearance of whichever symbol type.
-        for start, module in first_function:
+        # Show modules ordered by the computed average of addresses
+        for _, module in computed_order:
             if not module.startswith(prefix):
                 continue
 
@@ -172,10 +204,11 @@ def suggest_order(results: List[RoadmapRow], cmake_modules: List[str], match_typ
             avg_displacement = None
             displacements = dc.disp_map.get(module)
             if displacements is not None and len(displacements) > 0:
-                avg_displacement = int(sum(displacements) / len(displacements))
+                avg_displacement = int(statistics.mean(displacements))
 
+            earliest = dc.earliest.get(module)
             code_file = truncate_module_name(prefix, module)
-            print(f"0x{start:08x} {or_blank(avg_displacement):10} {code_file}")
+            print(f"0x{earliest:08x} {or_blank(avg_displacement):10} {code_file}")
 
         # These modules are included in the final binary (in some form) but
         # they are not represented by whichever type of symbol we were checking.