const Object = @This();

const std = @import("std");
const build_options = @import("build_options");
const assert = std.debug.assert;
const fs = std.fs;
const io = std.io;
const log = std.log.scoped(.link);
const macho = std.macho;
const math = std.math;
const mem = std.mem;
const sort = std.sort;
const trace = @import("../../tracy.zig").trace;

const Allocator = mem.Allocator;
const Atom = @import("Atom.zig");
const MachO = @import("../MachO.zig");
const MatchingSection = MachO.MatchingSection;
const SymbolWithLoc = MachO.SymbolWithLoc;

/// Handle of the file this object is read from.
file: fs.File,
/// Name of the object. Freed with the allocator passed to `deinit`.
name: []const u8,
mtime: u64,

/// Full contents of `file` as read by `parse` (header, load commands, sections,
/// symbol table, string table, ...). Every file offset stored in the load
/// commands after `parse` is an index into this buffer.
/// Initialized in `parse`; freed in `deinit`.
contents: []const u8 = undefined,

/// Byte offset within `contents` at which the Mach-O header of this object
/// starts. `null` is treated as 0 by `parse`.
file_offset: ?u32 = null,

/// Mach-O header. Initialized in `parse`.
header: macho.mach_header_64 = undefined,

/// Load commands in file order. Offsets inside them are rebased by
/// `file_offset` during `parse`.
load_commands: std.ArrayListUnmanaged(macho.LoadCommand) = .{},

// Indexes into `load_commands` (or, for `text_section_index`, into the
// sections of the segment command) recorded by `parse`.
segment_cmd_index: ?u16 = null,
text_section_index: ?u16 = null,
symtab_cmd_index: ?u16 = null,
dysymtab_cmd_index: ?u16 = null,
build_version_cmd_index: ?u16 = null,
data_in_code_cmd_index: ?u16 = null,

// __DWARF segment sections
dwarf_debug_info_index: ?u16 = null,
dwarf_debug_abbrev_index: ?u16 = null,
dwarf_debug_str_index: ?u16 = null,
dwarf_debug_line_index: ?u16 = null,
dwarf_debug_line_str_index: ?u16 = null,
dwarf_debug_ranges_index: ?u16 = null,

/// Mutable copy of the source symbol table (see `parseSymtab`). Synthetic
/// section-start symbols created while splitting into atoms are appended here.
symtab: std.ArrayListUnmanaged(macho.nlist_64) = .{},
/// String table; a view into `contents` set up by `parseSymtab`.
strtab: []const u8 = &.{},
data_in_code_entries: []const macho.data_in_code_entry = &.{},

/// Maps a source section index to the index in `symtab` of the synthetic
/// symbol standing for the start of that section.
sections_as_symbols: std.AutoHashMapUnmanaged(u16, u32) = .{},

/// List of atoms that map to the symbols parsed from this object file.
/// The atoms are owned by this object and destroyed in `deinit`.
managed_atoms: std.ArrayListUnmanaged(*Atom) = .{},

/// Table of atoms belonging to this object file indexed by the symbol index.
atom_by_index_table: std.AutoHashMapUnmanaged(u32, *Atom) = .{},

/// Releases everything owned by this object: load commands, file contents,
/// symbol table, lookup tables, managed atoms and the object name.
/// `gpa` must be the allocator that was used to allocate them.
pub fn deinit(self: *Object, gpa: Allocator) void {
    for (self.load_commands.items) |*lc| {
        lc.deinit(gpa);
    }
    self.load_commands.deinit(gpa);
    gpa.free(self.contents);
    self.symtab.deinit(gpa);
    self.sections_as_symbols.deinit(gpa);
    self.atom_by_index_table.deinit(gpa);

    for (self.managed_atoms.items) |atom| {
        atom.deinit(gpa);
        gpa.destroy(atom);
    }
    self.managed_atoms.deinit(gpa);

    gpa.free(self.name);
}

/// Reads the whole file into `contents`, then parses the Mach-O header and
/// load commands starting at `file_offset`, and finally the symbol table.
///
/// All file offsets found in the recognised load commands are rebased by
/// `file_offset` so that they can be used directly as indexes into `contents`.
///
/// Errors:
/// * `error.NotObject` if the header's filetype is not `MH_OBJECT`.
/// * `error.UnsupportedCpuArchitecture` for a cputype other than arm64/x86_64.
/// * `error.MismatchedCpuArchitecture` if the object's arch differs from `cpu_arch`.
/// * `error.Overflow` if the file size does not fit `usize` or the number of
///   load commands does not fit the `u16` command indexes used by this struct.
/// * Any error from reading the file or the load commands.
pub fn parse(self: *Object, allocator: Allocator, cpu_arch: std.Target.Cpu.Arch) !void {
    const file_stat = try self.file.stat();
    const file_size = math.cast(usize, file_stat.size) orelse return error.Overflow;
    self.contents = try self.file.readToEndAlloc(allocator, file_size);

    var stream = std.io.fixedBufferStream(self.contents);
    const reader = stream.reader();

    const file_offset = self.file_offset orelse 0;
    if (file_offset > 0) {
        try reader.context.seekTo(file_offset);
    }

    self.header = try reader.readStruct(macho.mach_header_64);
    if (self.header.filetype != macho.MH_OBJECT) {
        log.debug("invalid filetype: expected 0x{x}, found 0x{x}", .{
            macho.MH_OBJECT,
            self.header.filetype,
        });
        return error.NotObject;
    }

    const this_arch: std.Target.Cpu.Arch = switch (self.header.cputype) {
        macho.CPU_TYPE_ARM64 => .aarch64,
        macho.CPU_TYPE_X86_64 => .x86_64,
        else => |value| {
            log.err("unsupported cpu architecture 0x{x}", .{value});
            return error.UnsupportedCpuArchitecture;
        },
    };
    if (this_arch != cpu_arch) {
        log.err("mismatched cpu architecture: expected {s}, found {s}", .{ cpu_arch, this_arch });
        return error.MismatchedCpuArchitecture;
    }

    // Command indexes are stored as `u16`; reject headers that claim more
    // commands than that instead of overflowing the loop counter below.
    const ncmds = math.cast(u16, self.header.ncmds) orelse return error.Overflow;
    try self.load_commands.ensureUnusedCapacity(allocator, ncmds);

    var i: u16 = 0;
    while (i < ncmds) : (i += 1) {
        var cmd = try macho.LoadCommand.read(allocator, reader);
        switch (cmd.cmd()) {
            .SEGMENT_64 => {
                self.segment_cmd_index = i;
                // Operate on the command in place: a by-value copy would share
                // the sections slice but silently drop the `fileoff` fixup.
                const seg = &cmd.segment;
                for (seg.sections.items) |*sect, j| {
                    const index = @intCast(u16, j);
                    const segname = sect.segName();
                    const sectname = sect.sectName();
                    if (mem.eql(u8, segname, "__DWARF")) {
                        if (mem.eql(u8, sectname, "__debug_info")) {
                            self.dwarf_debug_info_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_abbrev")) {
                            self.dwarf_debug_abbrev_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_str")) {
                            self.dwarf_debug_str_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_line")) {
                            self.dwarf_debug_line_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_line_str")) {
                            self.dwarf_debug_line_str_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_ranges")) {
                            self.dwarf_debug_ranges_index = index;
                        }
                    } else if (mem.eql(u8, segname, "__TEXT")) {
                        if (mem.eql(u8, sectname, "__text")) {
                            self.text_section_index = index;
                        }
                    }

                    sect.offset += file_offset;
                    // A zero `reloff` is left untouched rather than rebased.
                    if (sect.reloff > 0) {
                        sect.reloff += file_offset;
                    }
                }

                seg.inner.fileoff += file_offset;
            },
            .SYMTAB => {
                self.symtab_cmd_index = i;
                cmd.symtab.symoff += file_offset;
                cmd.symtab.stroff += file_offset;
            },
            .DYSYMTAB => {
                self.dysymtab_cmd_index = i;
            },
            .BUILD_VERSION => {
                self.build_version_cmd_index = i;
            },
            .DATA_IN_CODE => {
                self.data_in_code_cmd_index = i;
                cmd.linkedit_data.dataoff += file_offset;
            },
            else => {
                log.debug("Unknown load command detected: 0x{x}.", .{cmd.cmd()});
            },
        }
        self.load_commands.appendAssumeCapacity(cmd);
    }

    try self.parseSymtab(allocator);
}

/// Read-only view of the symbol and string tables used while sorting and
/// filtering symbols.
const Context = struct {
    symtab: []const macho.nlist_64,
    strtab: []const u8,
};

/// A symbol referred to by its index into `Context.symtab`.
const SymbolAtIndex = struct {
    index: u32,

    /// Returns a copy of the symbol table entry at `index`.
    fn getSymbol(self: SymbolAtIndex, ctx: Context) macho.nlist_64 {
        return ctx.symtab[self.index];
    }

    /// Returns the symbol's name: the bytes of `ctx.strtab` starting at the
    /// symbol's `n_strx` up to (excluding) the first 0 byte.
    /// Asserts that `n_strx` lies within the string table.
    fn getSymbolName(self: SymbolAtIndex, ctx: Context) []const u8 {
        const sym = self.getSymbol(ctx);
        assert(sym.n_strx < ctx.strtab.len);
        return mem.sliceTo(@ptrCast([*:0]const u8, ctx.strtab.ptr + sym.n_strx), 0);
    }

    /// Returns whether lhs is less than rhs by allocated address in object file.
    /// Symbols for which `sect()` holds order before all other symbols, which
    /// pushes everything else (e.g. undefined symbols) to the back. Two
    /// `sect()` symbols are ordered by `n_value`.
    fn lessThan(ctx: Context, lhs_index: SymbolAtIndex, rhs_index: SymbolAtIndex) bool {
        const lhs = lhs_index.getSymbol(ctx);
        const rhs = rhs_index.getSymbol(ctx);
        if (lhs.sect()) {
            if (rhs.sect()) {
                // Same group, sort by address.
                return lhs.n_value < rhs.n_value;
            } else {
                return true;
            }
        } else {
            return false;
        }
    }

    /// Returns whether lhs is less senior than rhs. The rules are:
    /// 1. ext
    /// 2. weak
    /// 3. local
    /// 4. temp (local starting with `l` prefix).
    fn lessThanBySeniority(ctx: Context, lhs_index: SymbolAtIndex, rhs_index: SymbolAtIndex) bool {
        const lhs = lhs_index.getSymbol(ctx);
        const rhs = rhs_index.getSymbol(ctx);
        if (!rhs.ext()) {
            // rhs is local: lhs is less senior only if its name marks it as a
            // temporary (starts with `l` or `L`).
            const lhs_name = lhs_index.getSymbolName(ctx);
            return mem.startsWith(u8, lhs_name, "l") or mem.startsWith(u8, lhs_name, "L");
        } else if (rhs.pext() or rhs.weakDef()) {
            return !lhs.ext();
        } else {
            return false;
        }
    }

    /// Like lessThanBySeniority but negated.
    /// Note that, being a plain negation, this also returns true whenever
    /// `lessThanBySeniority` returns false for both argument orders.
    fn greaterThanBySeniority(ctx: Context, lhs_index: SymbolAtIndex, rhs_index: SymbolAtIndex) bool {
        return !lessThanBySeniority(ctx, lhs_index, rhs_index);
    }
};

/// Returns the sub-slice of `indexes` running from the first symbol whose
/// `n_value` is >= `start_addr` up to (excluding) the first subsequent symbol
/// whose `n_value` is >= `end_addr`.
/// `indexes` is expected to be ordered by ascending `n_value`.
/// NOTE(review): relies on `MachO.findFirst` returning the first index at or
/// after its start argument for which the predicate holds, or the slice length
/// if there is none - confirm against its definition.
fn filterSymbolsByAddress(
    indexes: []SymbolAtIndex,
    start_addr: u64,
    end_addr: u64,
    ctx: Context,
) []SymbolAtIndex {
    const Predicate = struct {
        addr: u64,
        ctx: Context,

        pub fn predicate(pred: @This(), index: SymbolAtIndex) bool {
            return index.getSymbol(pred.ctx).n_value >= pred.addr;
        }
    };

    const start = MachO.findFirst(SymbolAtIndex, indexes, 0, Predicate{
        .addr = start_addr,
        .ctx = ctx,
    });
    const end = MachO.findFirst(SymbolAtIndex, indexes, start, Predicate{
        .addr = end_addr,
        .ctx = ctx,
    });

    return indexes[start..end];
}

/// Returns the sub-slice of `relocs` running from the first relocation whose
/// `r_address` is below `end_addr` up to (excluding) the first subsequent one
/// whose `r_address` is below `start_addr`.
/// The search order (upper bound first) only selects the range
/// [start_addr, end_addr) if `relocs` is ordered by descending `r_address`.
/// NOTE(review): same assumption about `MachO.findFirst` as in
/// `filterSymbolsByAddress` - confirm against its definition.
fn filterRelocs(
    relocs: []const macho.relocation_info,
    start_addr: u64,
    end_addr: u64,
) []const macho.relocation_info {
    const Predicate = struct {
        addr: u64,

        pub fn predicate(self: @This(), rel: macho.relocation_info) bool {
            return rel.r_address < self.addr;
        }
    };

    const start = MachO.findFirst(macho.relocation_info, relocs, 0, Predicate{ .addr = end_addr });
    const end = MachO.findFirst(macho.relocation_info, relocs, start, Predicate{ .addr = start_addr });

    return relocs[start..end];
}

/// Splits object into atoms assuming one-shot linking mode.
///
/// For every section of the object's segment command that has a matching
/// output section, creates one or more atoms and adds them to that output
/// section via `macho_file.addAtomToSection`:
/// * If the object sets `MH_SUBSECTIONS_VIA_SYMBOLS` and the section contains
///   symbols, one atom is created per distinct symbol address (plus one for
///   any bytes preceding the first symbol).
/// * Otherwise a single atom covering the whole section is created.
///
/// Asserts that `macho_file.mode` is `.one_shot` and that a segment command
/// was found by `parse`.
pub fn splitIntoAtomsOneShot(self: *Object, macho_file: *MachO, object_id: u32) !void {
    assert(macho_file.mode == .one_shot);

    const tracy = trace(@src());
    defer tracy.end();

    const gpa = macho_file.base.allocator;
    const seg = self.load_commands.items[self.segment_cmd_index.?].segment;

    log.debug("splitting object({d}, {s}) into atoms: one-shot mode", .{ object_id, self.name });

    // You would expect that the symbol table is at least pre-sorted based on symbol's type:
    // local < extern defined < undefined. Unfortunately, this is not guaranteed! For instance,
    // the GO compiler does not necessarily respect that therefore we sort immediately by type
    // and address within.
    const context = Context{
        .symtab = self.getSourceSymtab(),
        .strtab = self.strtab,
    };
    var sorted_all_syms = try std.ArrayList(SymbolAtIndex).initCapacity(gpa, context.symtab.len);
    defer sorted_all_syms.deinit();

    for (context.symtab) |_, index| {
        sorted_all_syms.appendAssumeCapacity(.{ .index = @intCast(u32, index) });
    }

    // We sort by type: defined < undefined, and
    // afterwards by address in each group. Normally, dysymtab should
    // be enough to guarantee the sort, but turns out not every compiler
    // is kind enough to specify the symbols in the correct order.
    sort.sort(SymbolAtIndex, sorted_all_syms.items, context, SymbolAtIndex.lessThan);

    // Sometimes compilers skip the dysymtab load command altogether, meaning we
    // have to infer the start of undef section in the symtab ourselves.
    const iundefsym = if (self.dysymtab_cmd_index) |cmd_index| blk: {
        const dysymtab = self.load_commands.items[cmd_index].dysymtab;
        break :blk dysymtab.iundefsym;
    } else blk: {
        // Walk back from the end until the last symbol defined in a section.
        var iundefsym: usize = sorted_all_syms.items.len;
        while (iundefsym > 0) : (iundefsym -= 1) {
            const sym = sorted_all_syms.items[iundefsym - 1].getSymbol(context);
            if (sym.sect()) break;
        }
        break :blk iundefsym;
    };

    // We only care about defined symbols, so filter every other out.
    const sorted_syms = sorted_all_syms.items[0..iundefsym];
    const subsections_via_symbols = self.header.flags & macho.MH_SUBSECTIONS_VIA_SYMBOLS != 0;

    for (seg.sections.items) |sect, id| {
        const sect_id = @intCast(u8, id);
        log.debug("splitting section '{s},{s}' into atoms", .{ sect.segName(), sect.sectName() });

        // Get matching segment/section in the final artifact.
        const match = (try macho_file.getMatchingSection(sect)) orelse {
            log.debug(" unhandled section", .{});
            continue;
        };

        log.debug(" output sect({d}, '{s},{s}')", .{
            macho_file.getSectionOrdinal(match),
            macho_file.getSection(match).segName(),
            macho_file.getSection(match).sectName(),
        });

        const cpu_arch = macho_file.base.options.target.cpu.arch;
        const is_zerofill = blk: {
            const section_type = sect.type_();
            break :blk section_type == macho.S_ZEROFILL or section_type == macho.S_THREAD_LOCAL_ZEROFILL;
        };

        // Read section's code
        const code: ?[]const u8 = if (!is_zerofill) try self.getSectionContents(sect_id) else null;

        // Read section's list of relocations
        const raw_relocs = self.contents[sect.reloff..][0 .. sect.nreloc * @sizeOf(macho.relocation_info)];
        const relocs = mem.bytesAsSlice(
            macho.relocation_info,
            @alignCast(@alignOf(macho.relocation_info), raw_relocs),
        );

        // Symbols within this section only.
        const filtered_syms = filterSymbolsByAddress(
            sorted_syms,
            sect.addr,
            sect.addr + sect.size,
            context,
        );

        if (subsections_via_symbols and filtered_syms.len > 0) {
            // If the first nlist does not match the start of the section,
            // then we need to encapsulate the memory range [section start, first symbol)
            // as a temporary symbol and insert the matching Atom.
            const first_sym = filtered_syms[0].getSymbol(context);
            if (first_sym.n_value > sect.addr) {
                const sym_index = self.sections_as_symbols.get(sect_id) orelse blk: {
                    const sym_index = @intCast(u32, self.symtab.items.len);
                    try self.symtab.append(gpa, .{
                        .n_strx = 0,
                        .n_type = macho.N_SECT,
                        .n_sect = macho_file.getSectionOrdinal(match),
                        .n_desc = 0,
                        .n_value = sect.addr,
                    });
                    try self.sections_as_symbols.putNoClobber(gpa, sect_id, sym_index);
                    break :blk sym_index;
                };
                const atom_size = first_sym.n_value - sect.addr;
                const atom_code: ?[]const u8 = if (code) |cc| blk: {
                    const size = math.cast(usize, atom_size) orelse return error.Overflow;
                    break :blk cc[0..size];
                } else null;
                const atom = try self.createAtomFromSubsection(
                    macho_file,
                    object_id,
                    sym_index,
                    atom_size,
                    sect.@"align",
                    atom_code,
                    relocs,
                    &.{},
                    match,
                    sect,
                );
                try macho_file.addAtomToSection(atom, match);
            }

            var next_sym_count: usize = 0;
            while (next_sym_count < filtered_syms.len) {
                const next_sym = filtered_syms[next_sym_count].getSymbol(context);
                const addr = next_sym.n_value;
                // All symbols sharing this exact address end up in the same atom.
                const atom_syms = filterSymbolsByAddress(
                    filtered_syms[next_sym_count..],
                    addr,
                    addr + 1,
                    context,
                );
                next_sym_count += atom_syms.len;

                // We want to bubble up the first externally defined symbol here.
                assert(atom_syms.len > 0);
                var sorted_atom_syms = std.ArrayList(SymbolAtIndex).init(gpa);
                defer sorted_atom_syms.deinit();
                try sorted_atom_syms.appendSlice(atom_syms);
                sort.sort(
                    SymbolAtIndex,
                    sorted_atom_syms.items,
                    context,
                    SymbolAtIndex.greaterThanBySeniority,
                );

                // The atom extends up to the next symbol, or to the end of the section.
                const atom_size = blk: {
                    const end_addr = if (next_sym_count < filtered_syms.len)
                        filtered_syms[next_sym_count].getSymbol(context).n_value
                    else
                        sect.addr + sect.size;
                    break :blk end_addr - addr;
                };
                const atom_code: ?[]const u8 = if (code) |cc| blk: {
                    const start = math.cast(usize, addr - sect.addr) orelse return error.Overflow;
                    const size = math.cast(usize, atom_size) orelse return error.Overflow;
                    break :blk cc[start..][0..size];
                } else null;
                // Alignment is capped by the number of trailing zero bits of the address.
                const atom_align = if (addr > 0)
                    math.min(@ctz(u64, addr), sect.@"align")
                else
                    sect.@"align";
                const atom = try self.createAtomFromSubsection(
                    macho_file,
                    object_id,
                    sorted_atom_syms.items[0].index,
                    atom_size,
                    atom_align,
                    atom_code,
                    relocs,
                    sorted_atom_syms.items[1..],
                    match,
                    sect,
                );

                if (cpu_arch == .x86_64 and addr == sect.addr) {
                    // In x86_64 relocs, it can so happen that the compiler refers to the same
                    // atom by both the actual assigned symbol and the start of the section. In this
                    // case, we need to link the two together so add an alias.
                    const alias = self.sections_as_symbols.get(sect_id) orelse blk: {
                        const alias = @intCast(u32, self.symtab.items.len);
                        try self.symtab.append(gpa, .{
                            .n_strx = 0,
                            .n_type = macho.N_SECT,
                            .n_sect = macho_file.getSectionOrdinal(match),
                            .n_desc = 0,
                            .n_value = addr,
                        });
                        try self.sections_as_symbols.putNoClobber(gpa, sect_id, alias);
                        break :blk alias;
                    };
                    try atom.contained.append(gpa, .{
                        .sym_index = alias,
                        .offset = 0,
                    });
                    try self.atom_by_index_table.put(gpa, alias, atom);
                }

                try macho_file.addAtomToSection(atom, match);
            }
        } else {
            // If there is no symbol to refer to this atom, we create
            // a temp one, unless we already did that when working out the relocations
            // of other atoms.
            const sym_index = self.sections_as_symbols.get(sect_id) orelse blk: {
                const sym_index = @intCast(u32, self.symtab.items.len);
                try self.symtab.append(gpa, .{
                    .n_strx = 0,
                    .n_type = macho.N_SECT,
                    .n_sect = macho_file.getSectionOrdinal(match),
                    .n_desc = 0,
                    .n_value = sect.addr,
                });
                try self.sections_as_symbols.putNoClobber(gpa, sect_id, sym_index);
                break :blk sym_index;
            };
            const atom = try self.createAtomFromSubsection(
                macho_file,
                object_id,
                sym_index,
                sect.size,
                sect.@"align",
                code,
                relocs,
                filtered_syms,
                match,
                sect,
            );
            try macho_file.addAtomToSection(atom, match);
        }
    }
}

/// Creates an atom of `size` bytes for the symbol at `sym_index`, registers it
/// with this object and returns it.
///
/// * `code`, when non-null, must be exactly `size` bytes long (asserted) and is
///   copied into the atom.
/// * `relocs` are all relocations of the source section `sect`; only those
///   falling within the atom's range are handed to `atom.parseRelocs`.
/// * `indexes` are further symbols located inside the atom; each one is
///   recorded in `atom.contained` and mapped to the atom.
/// * `match` is the output section; the `n_sect` of the atom's symbol and of
///   every contained symbol is rewritten to its ordinal.
///
/// The returned atom is owned by `managed_atoms` and destroyed in `deinit`.
fn createAtomFromSubsection(
    self: *Object,
    macho_file: *MachO,
    object_id: u32,
    sym_index: u32,
    size: u64,
    alignment: u32,
    code: ?[]const u8,
    relocs: []const macho.relocation_info,
    indexes: []const SymbolAtIndex,
    match: MatchingSection,
    sect: macho.section_64,
) !*Atom {
    const gpa = macho_file.base.allocator;
    const sym = self.symtab.items[sym_index];

    // Reserve the ownership slot before creating the atom so that the hand-over
    // to `managed_atoms` cannot fail; otherwise an allocation failure between
    // creation and registration would leak the atom.
    try self.managed_atoms.ensureUnusedCapacity(gpa, 1);
    const atom = try MachO.createEmptyAtom(gpa, sym_index, size, alignment);
    self.managed_atoms.appendAssumeCapacity(atom);

    atom.file = object_id;
    self.symtab.items[sym_index].n_sect = macho_file.getSectionOrdinal(match);

    log.debug("creating ATOM(%{d}, '{s}') in sect({d}, '{s},{s}') in object({d})", .{
        sym_index,
        self.getString(sym.n_strx),
        macho_file.getSectionOrdinal(match),
        macho_file.getSection(match).segName(),
        macho_file.getSection(match).sectName(),
        object_id,
    });

    try self.atom_by_index_table.putNoClobber(gpa, sym_index, atom);

    if (code) |cc| {
        assert(size == cc.len);
        mem.copy(u8, atom.code.items, cc);
    }

    // Offset of the atom within its source section.
    const base_offset = sym.n_value - sect.addr;
    const filtered_relocs = filterRelocs(relocs, base_offset, base_offset + size);
    try atom.parseRelocs(filtered_relocs, .{
        .macho_file = macho_file,
        .base_addr = sect.addr,
        .base_offset = @intCast(i32, base_offset),
    });

    // Since this atom may get a helper local temporary symbol that didn't exist
    // in the object file and which encompasses the entire section, we need to
    // traverse the filtered symbols and note which symbol is contained within
    // so that we can properly allocate addresses down the line.
    // While we're at it, we need to update segment,section mapping of each symbol too.
    try atom.contained.ensureTotalCapacity(gpa, indexes.len);

    for (indexes) |inner_sym_index| {
        const inner_sym = &self.symtab.items[inner_sym_index.index];
        inner_sym.n_sect = macho_file.getSectionOrdinal(match);
        atom.contained.appendAssumeCapacity(.{
            .sym_index = inner_sym_index.index,
            .offset = inner_sym.n_value - sym.n_value,
        });

        try self.atom_by_index_table.putNoClobber(gpa, inner_sym_index.index, atom);
    }

    return atom;
}

/// Copies the source symbol table into `symtab` and points `strtab` at the
/// string table inside `contents`. Does nothing if the object has no symtab
/// load command.
fn parseSymtab(self: *Object, allocator: Allocator) !void {
    const index = self.symtab_cmd_index orelse return;
    const symtab = self.load_commands.items[index].symtab;
    try self.symtab.appendSlice(allocator, self.getSourceSymtab());
    self.strtab = self.contents[symtab.stroff..][0..symtab.strsize];
}

/// Returns the symbol table exactly as stored in the object file, as a view
/// into `contents`. Returns an empty slice if there is no symtab load command.
pub fn getSourceSymtab(self: Object) []const macho.nlist_64 {
    const index = self.symtab_cmd_index orelse return &[0]macho.nlist_64{};
    const symtab = self.load_commands.items[index].symtab;
    const symtab_size = @sizeOf(macho.nlist_64) * symtab.nsyms;
    const raw_symtab = self.contents[symtab.symoff..][0..symtab_size];
    return mem.bytesAsSlice(
        macho.nlist_64,
        @alignCast(@alignOf(macho.nlist_64), raw_symtab),
    );
}

/// Returns the symbol at `index` of the source symbol table, or null if
/// `index` is out of bounds.
pub fn getSourceSymbol(self: Object, index: u32) ?macho.nlist_64 {
    const symtab = self.getSourceSymtab();
    if (index >= symtab.len) return null;
    return symtab[index];
}

/// Returns the section header at `index` of the object's segment command.
/// Asserts that a segment command exists and that `index` is in bounds.
pub fn getSourceSection(self: Object, index: u16) macho.section_64 {
    const seg = self.load_commands.items[self.segment_cmd_index.?].segment;
    assert(index < seg.sections.items.len);
    return seg.sections.items[index];
}

/// Returns the data-in-code entries as a view into `contents`, or null if the
/// object has no data-in-code load command.
pub fn parseDataInCode(self: Object) ?[]const macho.data_in_code_entry {
    const index = self.data_in_code_cmd_index orelse return null;
    const data_in_code = self.load_commands.items[index].linkedit_data;
    const raw_dice = self.contents[data_in_code.dataoff..][0..data_in_code.datasize];
    return mem.bytesAsSlice(
        macho.data_in_code_entry,
        @alignCast(@alignOf(macho.data_in_code_entry), raw_dice),
    );
}

/// Returns the bytes of the section at `index` as a view into `contents`.
/// Returns `error.Overflow` if the section size does not fit `usize`.
pub fn getSectionContents(self: Object, index: u16) error{Overflow}![]const u8 {
    const sect = self.getSourceSection(index);
    const size = math.cast(usize, sect.size) orelse return error.Overflow;
    log.debug("getting {s},{s} data at 0x{x} - 0x{x}", .{
        sect.segName(),
        sect.sectName(),
        sect.offset,
        sect.offset + sect.size,
    });
    return self.contents[sect.offset..][0..size];
}

/// Returns the string starting at offset `off` of the string table, up to
/// (excluding) the first 0 byte. Asserts that `off` is within `strtab`.
pub fn getString(self: Object, off: u32) []const u8 {
    assert(off < self.strtab.len);
    return mem.sliceTo(@ptrCast([*:0]const u8, self.strtab.ptr + off), 0);
}

/// Returns the atom registered for the symbol at `sym_index`, if any.
pub fn getAtomForSymbol(self: Object, sym_index: u32) ?*Atom {
    return self.atom_by_index_table.get(sym_index);
}