zig/src/link/MachO/Object.zig

const Object = @This();

const std = @import("std");
const build_options = @import("build_options");
const assert = std.debug.assert;
const fs = std.fs;
const io = std.io;
const log = std.log.scoped(.link);
const macho = std.macho;
const math = std.math;
const mem = std.mem;
const sort = std.sort;
const trace = @import("../../tracy.zig").trace;

const Allocator = mem.Allocator;
const Atom = @import("Atom.zig");
const MachO = @import("../MachO.zig");
const MatchingSection = MachO.MatchingSection;
const SymbolWithLoc = MachO.SymbolWithLoc;

/// Handle of the object file; `parse` stats it and reads its bytes into `contents`.
file: fs.File,
/// Name of this object, used in log messages. Freed in `deinit`, so it must have been
/// allocated with the allocator that is passed there.
name: []const u8,
/// NOTE(review): presumably the modification time of `file`; nothing in the visible part of
/// this file reads or writes it — confirm against the code that constructs `Object`.
mtime: u64,

/// Bytes read from `file` by `parse`. The Mach-O header and the load commands are decoded
/// out of this buffer, and section data, relocations, the symbol table, the string table and
/// data-in-code entries are all accessed as slices of it.
/// Allocated in `parse` and freed in `deinit`; undefined until `parse` has run.
contents: []const u8 = undefined,

/// Offset into `contents` at which this object's Mach-O header starts; `null` is treated as 0.
/// `parse` adds it to the file offsets it records (sections, relocations, symtab, strtab,
/// data-in-code) so that they can be used directly as indexes into `contents`.
file_offset: ?u32 = null,

/// Mach-O header, read in `parse`.
header: macho.mach_header_64 = undefined,

/// Load commands in the order they were read by `parse`.
load_commands: std.ArrayListUnmanaged(macho.LoadCommand) = .{},

/// Index into `load_commands` of the segment command (the last one seen, if there are several).
segment_cmd_index: ?u16 = null,
/// Index of the `__TEXT,__text` section within the segment's section list.
text_section_index: ?u16 = null,
/// Indexes into `load_commands` of the respective load commands, set by `parse` when seen.
symtab_cmd_index: ?u16 = null,
dysymtab_cmd_index: ?u16 = null,
build_version_cmd_index: ?u16 = null,
data_in_code_cmd_index: ?u16 = null,

// __DWARF segment sections: indexes within the segment's section list, set by `parse`.
dwarf_debug_info_index: ?u16 = null,
dwarf_debug_abbrev_index: ?u16 = null,
dwarf_debug_str_index: ?u16 = null,
dwarf_debug_line_index: ?u16 = null,
dwarf_debug_line_str_index: ?u16 = null,
dwarf_debug_ranges_index: ?u16 = null,

/// Symbol table of this object. Starts out as a copy of the source symbol table (see
/// `parseSymtab`); `splitIntoAtomsOneShot` appends synthetic section-start symbols to it.
symtab: std.ArrayListUnmanaged(macho.nlist_64) = .{},
/// String table; a slice of `contents` set in `parseSymtab`.
strtab: []const u8 = &.{},
/// NOTE(review): never assigned in the visible part of this file (`parseDataInCode` returns
/// the entries without storing them here) — confirm who populates this.
data_in_code_entries: []const macho.data_in_code_entry = &.{},

/// Maps a source section index to the index in `symtab` of the synthetic symbol that
/// stands for the start of that section.
sections_as_symbols: std.AutoHashMapUnmanaged(u16, u32) = .{},

/// List of atoms that map to the symbols parsed from this object file.
/// The atoms are owned by this object: `deinit` deinitializes and destroys each of them.
managed_atoms: std.ArrayListUnmanaged(*Atom) = .{},

/// Table of atoms belonging to this object file indexed by the symbol index.
atom_by_index_table: std.AutoHashMapUnmanaged(u32, *Atom) = .{},

/// Releases everything this object owns: the load commands, the file contents buffer,
/// the symbol table, both lookup tables, every managed atom and finally the object's name.
/// `gpa` must be the allocator that was used to allocate all of the above.
pub fn deinit(self: *Object, gpa: Allocator) void {
    // Each load command may hold its own allocations, so release them before the list itself.
    for (self.load_commands.items) |*load_command| load_command.deinit(gpa);
    self.load_commands.deinit(gpa);

    gpa.free(self.contents);

    self.symtab.deinit(gpa);
    self.sections_as_symbols.deinit(gpa);
    // Only the table's own storage is released here; the atoms it points at are
    // destroyed through `managed_atoms` just below.
    self.atom_by_index_table.deinit(gpa);

    for (self.managed_atoms.items) |owned_atom| {
        owned_atom.deinit(gpa);
        gpa.destroy(owned_atom);
    }
    self.managed_atoms.deinit(gpa);

    gpa.free(self.name);
}

/// Reads the file into `self.contents` and parses the Mach-O header, the load commands
/// and the symbol table out of it.
///
/// `cpu_arch` is the architecture being linked for; an object built for any other
/// architecture is rejected.
///
/// Every file offset recorded in the load commands (segment, section data, relocations,
/// symtab, strtab, data-in-code) is rebased by `self.file_offset`, so that afterwards it
/// can be used directly as an index into `self.contents`.
///
/// Errors:
/// * `error.Overflow` if the file size does not fit in `usize`,
/// * `error.NotObject` if the header's filetype is not `MH_OBJECT`,
/// * `error.UnsupportedCpuArchitecture` / `error.MismatchedCpuArchitecture`,
/// * anything returned by file I/O, allocation or `macho.LoadCommand.read`.
pub fn parse(self: *Object, allocator: Allocator, cpu_arch: std.Target.Cpu.Arch) !void {
    const file_stat = try self.file.stat();
    const file_size = math.cast(usize, file_stat.size) orelse return error.Overflow;
    self.contents = try self.file.readToEndAlloc(allocator, file_size);

    var stream = std.io.fixedBufferStream(self.contents);
    const reader = stream.reader();

    // The Mach-O header does not necessarily start at the beginning of the buffer.
    const file_offset = self.file_offset orelse 0;
    if (file_offset > 0) {
        try reader.context.seekTo(file_offset);
    }

    self.header = try reader.readStruct(macho.mach_header_64);
    if (self.header.filetype != macho.MH_OBJECT) {
        log.debug("invalid filetype: expected 0x{x}, found 0x{x}", .{
            macho.MH_OBJECT,
            self.header.filetype,
        });
        return error.NotObject;
    }

    const this_arch: std.Target.Cpu.Arch = switch (self.header.cputype) {
        macho.CPU_TYPE_ARM64 => .aarch64,
        macho.CPU_TYPE_X86_64 => .x86_64,
        else => |value| {
            log.err("unsupported cpu architecture 0x{x}", .{value});
            return error.UnsupportedCpuArchitecture;
        },
    };
    if (this_arch != cpu_arch) {
        log.err("mismatched cpu architecture: expected {s}, found {s}", .{ cpu_arch, this_arch });
        return error.MismatchedCpuArchitecture;
    }

    // Reserve up front so that the appends in the loop below cannot fail.
    try self.load_commands.ensureUnusedCapacity(allocator, self.header.ncmds);

    var i: u16 = 0;
    while (i < self.header.ncmds) : (i += 1) {
        var cmd = try macho.LoadCommand.read(allocator, reader);
        switch (cmd.cmd()) {
            .SEGMENT_64 => {
                self.segment_cmd_index = i;
                // Work through a pointer into `cmd`: a by-value copy of the segment would
                // make the `fileoff` adjustment below a dead store, because it is `cmd`
                // that gets appended to `load_commands`.
                const seg = &cmd.segment;
                for (seg.sections.items) |*sect, j| {
                    const index = @intCast(u16, j);
                    const segname = sect.segName();
                    const sectname = sect.sectName();
                    if (mem.eql(u8, segname, "__DWARF")) {
                        if (mem.eql(u8, sectname, "__debug_info")) {
                            self.dwarf_debug_info_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_abbrev")) {
                            self.dwarf_debug_abbrev_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_str")) {
                            self.dwarf_debug_str_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_line")) {
                            self.dwarf_debug_line_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_line_str")) {
                            self.dwarf_debug_line_str_index = index;
                        } else if (mem.eql(u8, sectname, "__debug_ranges")) {
                            self.dwarf_debug_ranges_index = index;
                        }
                    } else if (mem.eql(u8, segname, "__TEXT")) {
                        if (mem.eql(u8, sectname, "__text")) {
                            self.text_section_index = index;
                        }
                    }

                    sect.offset += file_offset;
                    // A zero `reloff` is left untouched so that it stays zero.
                    if (sect.reloff > 0) {
                        sect.reloff += file_offset;
                    }
                }

                seg.inner.fileoff += file_offset;
            },
            .SYMTAB => {
                self.symtab_cmd_index = i;
                cmd.symtab.symoff += file_offset;
                cmd.symtab.stroff += file_offset;
            },
            .DYSYMTAB => {
                self.dysymtab_cmd_index = i;
            },
            .BUILD_VERSION => {
                self.build_version_cmd_index = i;
            },
            .DATA_IN_CODE => {
                self.data_in_code_cmd_index = i;
                cmd.linkedit_data.dataoff += file_offset;
            },
            else => {
                log.debug("Unknown load command detected: 0x{x}.", .{cmd.cmd()});
            },
        }
        self.load_commands.appendAssumeCapacity(cmd);
    }

    try self.parseSymtab(allocator);
}

/// Read-only view of a symbol table together with its string table. It is passed to the
/// `SymbolAtIndex` accessors and is the context argument of the sort comparators.
const Context = struct {
    /// Symbol entries that a `SymbolAtIndex.index` refers to.
    symtab: []const macho.nlist_64,
    /// String table that the entries' `n_strx` offsets point into.
    strtab: []const u8,
};

/// A symbol identified by its position in a `Context`'s symbol table. Keeping only the
/// index allows symbols to be sorted and filtered without moving the nlist entries.
const SymbolAtIndex = struct {
    index: u32,

    /// Returns a copy of the nlist entry at `self.index` in `ctx.symtab`.
    fn getSymbol(self: SymbolAtIndex, ctx: Context) macho.nlist_64 {
        return ctx.symtab[self.index];
    }

    /// Returns the symbol's name: the bytes of `ctx.strtab` starting at the symbol's
    /// `n_strx` up to (not including) the next zero byte.
    /// Asserts that `n_strx` lies within the string table.
    fn getSymbolName(self: SymbolAtIndex, ctx: Context) []const u8 {
        const name_offset = self.getSymbol(ctx).n_strx;
        assert(name_offset < ctx.strtab.len);
        const name_start = @ptrCast([*:0]const u8, ctx.strtab.ptr + name_offset);
        return mem.sliceTo(name_start, 0);
    }

    /// Sort comparator ordering symbols by their address in the object file.
    /// Symbols for which `sect()` holds come first, ordered by ascending `n_value`;
    /// every other symbol is pushed to the back, i.e. it never compares as less than
    /// anything, while a section symbol always compares as less than a non-section one.
    fn lessThan(ctx: Context, lhs_index: SymbolAtIndex, rhs_index: SymbolAtIndex) bool {
        const left = lhs_index.getSymbol(ctx);
        const right = rhs_index.getSymbol(ctx);
        if (!left.sect()) return false;
        if (!right.sect()) return true;
        // Both are section symbols: order by address.
        return left.n_value < right.n_value;
    }

    /// Returns whether lhs is less senior than rhs. The intended ranking, from most to
    /// least senior, is:
    /// 1. ext
    /// 2. weak
    /// 3. local
    /// 4. temp (local starting with `l` prefix).
    ///
    /// As implemented: when rhs is not external, lhs is less senior only if its name
    /// starts with `l` or `L`; when rhs is private-external or a weak definition, lhs
    /// is less senior only if it is not external; otherwise lhs is never less senior.
    fn lessThanBySeniority(ctx: Context, lhs_index: SymbolAtIndex, rhs_index: SymbolAtIndex) bool {
        const left = lhs_index.getSymbol(ctx);
        const right = rhs_index.getSymbol(ctx);

        if (!right.ext()) {
            const left_name = lhs_index.getSymbolName(ctx);
            return mem.startsWith(u8, left_name, "l") or mem.startsWith(u8, left_name, "L");
        }
        if (right.pext() or right.weakDef()) {
            return !left.ext();
        }
        return false;
    }

    /// Logical negation of `lessThanBySeniority`; used to sort the most senior symbol first.
    fn greaterThanBySeniority(ctx: Context, lhs_index: SymbolAtIndex, rhs_index: SymbolAtIndex) bool {
        const is_less_senior = lessThanBySeniority(ctx, lhs_index, rhs_index);
        return !is_less_senior;
    }
};

/// Returns the sub-slice of `indexes` holding the symbols whose `n_value` falls in
/// `[start_addr, end_addr)`.
///
/// Callers in this file pass `indexes` already sorted by ascending address (see
/// `SymbolAtIndex.lessThan`); the result is only meaningful for such input.
/// NOTE(review): relies on `MachO.findFirst` returning the index of the first element at
/// or after the given start position that satisfies the predicate, or the slice length
/// if there is none — confirm against its definition.
fn filterSymbolsByAddress(
    indexes: []SymbolAtIndex,
    start_addr: u64,
    end_addr: u64,
    ctx: Context,
) []SymbolAtIndex {
    // Matches symbols located at or above `threshold`.
    const AtOrAbove = struct {
        threshold: u64,
        symbols: Context,

        pub fn predicate(self: @This(), candidate: SymbolAtIndex) bool {
            const value = candidate.getSymbol(self.symbols).n_value;
            return value >= self.threshold;
        }
    };

    const first_inside = MachO.findFirst(SymbolAtIndex, indexes, 0, AtOrAbove{
        .threshold = start_addr,
        .symbols = ctx,
    });
    // Continue from the lower bound: nothing before it can be at or past `end_addr`
    // when the input is sorted.
    const first_past = MachO.findFirst(SymbolAtIndex, indexes, first_inside, AtOrAbove{
        .threshold = end_addr,
        .symbols = ctx,
    });

    return indexes[first_inside..first_past];
}

/// Returns the sub-slice of `relocs` whose `r_address` lies in `[start_addr, end_addr)`.
///
/// The search runs from the high end of the range to the low end: it first looks for the
/// first relocation below `end_addr` and then, from there, for the first one below
/// `start_addr`.
/// NOTE(review): this presumes `relocs` is ordered by descending `r_address` and that
/// `MachO.findFirst` returns the slice length when nothing matches — confirm both.
fn filterRelocs(
    relocs: []const macho.relocation_info,
    start_addr: u64,
    end_addr: u64,
) []const macho.relocation_info {
    // Matches relocations located strictly below `limit`.
    const Below = struct {
        limit: u64,

        pub fn predicate(pred: @This(), reloc: macho.relocation_info) bool {
            return reloc.r_address < pred.limit;
        }
    };

    const first_inside = MachO.findFirst(
        macho.relocation_info,
        relocs,
        0,
        Below{ .limit = end_addr },
    );
    const first_past = MachO.findFirst(
        macho.relocation_info,
        relocs,
        first_inside,
        Below{ .limit = start_addr },
    );

    return relocs[first_inside..first_past];
}

/// Splits object into atoms assuming one-shot linking mode.
///
/// For every section of the object's segment that has a matching section in the output
/// (`macho_file.getMatchingSection`), one or more atoms are created and handed to
/// `macho_file.addAtomToSection`:
/// * if the header has `MH_SUBSECTIONS_VIA_SYMBOLS` set and the section contains at least
///   one defined symbol, the section is cut at every distinct symbol address;
/// * otherwise the whole section becomes a single atom that contains all of its symbols.
///
/// `object_id` is stored in every created atom as the atom's `file`.
/// Asserts that `macho_file.mode == .one_shot`; requires that `parse` found a segment command.
pub fn splitIntoAtomsOneShot(self: *Object, macho_file: *MachO, object_id: u32) !void {
    assert(macho_file.mode == .one_shot);

    const tracy = trace(@src());
    defer tracy.end();

    const gpa = macho_file.base.allocator;
    const seg = self.load_commands.items[self.segment_cmd_index.?].segment;

    log.debug("splitting object({d}, {s}) into atoms: one-shot mode", .{ object_id, self.name });

    // You would expect that the symbol table is at least pre-sorted based on symbol's type:
    // local < extern defined < undefined. Unfortunately, this is not guaranteed! For instance,
    // the Go compiler does not necessarily respect that, therefore we sort immediately by type
    // and address within.
    const context = Context{
        .symtab = self.getSourceSymtab(),
        .strtab = self.strtab,
    };
    var sorted_all_syms = try std.ArrayList(SymbolAtIndex).initCapacity(gpa, context.symtab.len);
    defer sorted_all_syms.deinit();

    // Start from the identity order; sorting permutes the indexes, not the symbols themselves.
    for (context.symtab) |_, index| {
        sorted_all_syms.appendAssumeCapacity(.{ .index = @intCast(u32, index) });
    }

    // We sort by type: defined < undefined, and
    // afterwards by address in each group. Normally, dysymtab should
    // be enough to guarantee the sort, but turns out not every compiler
    // is kind enough to specify the symbols in the correct order.
    sort.sort(SymbolAtIndex, sorted_all_syms.items, context, SymbolAtIndex.lessThan);

    // Some compilers skip the dysymtab load command altogether, meaning we
    // have to infer the start of the undef section in the symtab ourselves:
    // walk back from the end of the sorted list until the last section symbol.
    const iundefsym = if (self.dysymtab_cmd_index) |cmd_index| blk: {
        const dysymtab = self.load_commands.items[cmd_index].dysymtab;
        break :blk dysymtab.iundefsym;
    } else blk: {
        var iundefsym: usize = sorted_all_syms.items.len;
        while (iundefsym > 0) : (iundefsym -= 1) {
            const sym = sorted_all_syms.items[iundefsym - 1].getSymbol(context);
            if (sym.sect()) break;
        }
        break :blk iundefsym;
    };

    // We only care about defined symbols, so filter every other out.
    const sorted_syms = sorted_all_syms.items[0..iundefsym];
    const subsections_via_symbols = self.header.flags & macho.MH_SUBSECTIONS_VIA_SYMBOLS != 0;

    for (seg.sections.items) |sect, id| {
        const sect_id = @intCast(u8, id);
        log.debug("splitting section '{s},{s}' into atoms", .{ sect.segName(), sect.sectName() });

        // Get matching segment/section in the final artifact.
        const match = (try macho_file.getMatchingSection(sect)) orelse {
            log.debug("  unhandled section", .{});
            continue;
        };

        log.debug("  output sect({d}, '{s},{s}')", .{
            macho_file.getSectionOrdinal(match),
            macho_file.getSection(match).segName(),
            macho_file.getSection(match).sectName(),
        });

        const cpu_arch = macho_file.base.options.target.cpu.arch;
        // Zerofill sections are not read from the file, so their atoms get no code.
        const is_zerofill = blk: {
            const section_type = sect.type_();
            break :blk section_type == macho.S_ZEROFILL or section_type == macho.S_THREAD_LOCAL_ZEROFILL;
        };

        // Read section's code
        const code: ?[]const u8 = if (!is_zerofill) try self.getSectionContents(sect_id) else null;

        // Read section's list of relocations
        // (`sect.reloff` has already been rebased by the file offset in `parse`).
        const raw_relocs = self.contents[sect.reloff..][0 .. sect.nreloc * @sizeOf(macho.relocation_info)];
        const relocs = mem.bytesAsSlice(
            macho.relocation_info,
            @alignCast(@alignOf(macho.relocation_info), raw_relocs),
        );

        // Symbols within this section only.
        const filtered_syms = filterSymbolsByAddress(
            sorted_syms,
            sect.addr,
            sect.addr + sect.size,
            context,
        );

        if (subsections_via_symbols and filtered_syms.len > 0) {
            // If the first nlist does not match the start of the section,
            // then we need to encapsulate the memory range [section start, first symbol)
            // as a temporary symbol and insert the matching Atom.
            const first_sym = filtered_syms[0].getSymbol(context);
            if (first_sym.n_value > sect.addr) {
                // Reuse the synthetic section-start symbol if one exists, else create it.
                const sym_index = self.sections_as_symbols.get(sect_id) orelse blk: {
                    const sym_index = @intCast(u32, self.symtab.items.len);
                    try self.symtab.append(gpa, .{
                        .n_strx = 0,
                        .n_type = macho.N_SECT,
                        .n_sect = macho_file.getSectionOrdinal(match),
                        .n_desc = 0,
                        .n_value = sect.addr,
                    });
                    try self.sections_as_symbols.putNoClobber(gpa, sect_id, sym_index);
                    break :blk sym_index;
                };
                const atom_size = first_sym.n_value - sect.addr;
                const atom_code: ?[]const u8 = if (code) |cc| blk: {
                    const size = math.cast(usize, atom_size) orelse return error.Overflow;
                    break :blk cc[0..size];
                } else null;
                const atom = try self.createAtomFromSubsection(
                    macho_file,
                    object_id,
                    sym_index,
                    atom_size,
                    sect.@"align",
                    atom_code,
                    relocs,
                    &.{},
                    match,
                    sect,
                );
                try macho_file.addAtomToSection(atom, match);
            }

            // Walk the section's symbols in address order, one distinct address per iteration.
            var next_sym_count: usize = 0;
            while (next_sym_count < filtered_syms.len) {
                const next_sym = filtered_syms[next_sym_count].getSymbol(context);
                const addr = next_sym.n_value;
                // The range [addr, addr + 1) selects every symbol located at exactly `addr`.
                const atom_syms = filterSymbolsByAddress(
                    filtered_syms[next_sym_count..],
                    addr,
                    addr + 1,
                    context,
                );
                next_sym_count += atom_syms.len;

                // We want to bubble up the first externally defined symbol here.
                assert(atom_syms.len > 0);
                var sorted_atom_syms = std.ArrayList(SymbolAtIndex).init(gpa);
                defer sorted_atom_syms.deinit();
                try sorted_atom_syms.appendSlice(atom_syms);
                sort.sort(
                    SymbolAtIndex,
                    sorted_atom_syms.items,
                    context,
                    SymbolAtIndex.greaterThanBySeniority,
                );

                // The atom extends up to the next distinct symbol address, or to the end
                // of the section when this is the last one.
                const atom_size = blk: {
                    const end_addr = if (next_sym_count < filtered_syms.len)
                        filtered_syms[next_sym_count].getSymbol(context).n_value
                    else
                        sect.addr + sect.size;
                    break :blk end_addr - addr;
                };
                const atom_code: ?[]const u8 = if (code) |cc| blk: {
                    const start = math.cast(usize, addr - sect.addr) orelse return error.Overflow;
                    const size = math.cast(usize, atom_size) orelse return error.Overflow;
                    break :blk cc[start..][0..size];
                } else null;
                // The atom's alignment is the smaller of the section's alignment and the
                // number of trailing zero bits of its address.
                const atom_align = if (addr > 0)
                    math.min(@ctz(u64, addr), sect.@"align")
                else
                    sect.@"align";
                // The first (most senior) symbol identifies the atom; the remaining ones at
                // the same address are recorded as contained in it.
                const atom = try self.createAtomFromSubsection(
                    macho_file,
                    object_id,
                    sorted_atom_syms.items[0].index,
                    atom_size,
                    atom_align,
                    atom_code,
                    relocs,
                    sorted_atom_syms.items[1..],
                    match,
                    sect,
                );

                if (cpu_arch == .x86_64 and addr == sect.addr) {
                    // In x86_64 relocs, it can so happen that the compiler refers to the same
                    // atom by both the actual assigned symbol and the start of the section. In this
                    // case, we need to link the two together so add an alias.
                    const alias = self.sections_as_symbols.get(sect_id) orelse blk: {
                        const alias = @intCast(u32, self.symtab.items.len);
                        try self.symtab.append(gpa, .{
                            .n_strx = 0,
                            .n_type = macho.N_SECT,
                            .n_sect = macho_file.getSectionOrdinal(match),
                            .n_desc = 0,
                            .n_value = addr,
                        });
                        try self.sections_as_symbols.putNoClobber(gpa, sect_id, alias);
                        break :blk alias;
                    };
                    try atom.contained.append(gpa, .{
                        .sym_index = alias,
                        .offset = 0,
                    });
                    try self.atom_by_index_table.put(gpa, alias, atom);
                }

                try macho_file.addAtomToSection(atom, match);
            }
        } else {
            // If there is no symbol to refer to this atom, we create
            // a temp one, unless we already did that when working out the relocations
            // of other atoms.
            const sym_index = self.sections_as_symbols.get(sect_id) orelse blk: {
                const sym_index = @intCast(u32, self.symtab.items.len);
                try self.symtab.append(gpa, .{
                    .n_strx = 0,
                    .n_type = macho.N_SECT,
                    .n_sect = macho_file.getSectionOrdinal(match),
                    .n_desc = 0,
                    .n_value = sect.addr,
                });
                try self.sections_as_symbols.putNoClobber(gpa, sect_id, sym_index);
                break :blk sym_index;
            };
            // One atom for the whole section; all of its symbols become contained symbols.
            const atom = try self.createAtomFromSubsection(
                macho_file,
                object_id,
                sym_index,
                sect.size,
                sect.@"align",
                code,
                relocs,
                filtered_syms,
                match,
                sect,
            );
            try macho_file.addAtomToSection(atom, match);
        }
    }
}

/// Creates the atom identified by symbol `sym_index`, covering `size` bytes of the source
/// section `sect`, and registers it with this object.
///
/// * `alignment` and `size` are forwarded to `MachO.createEmptyAtom`.
/// * `code`, when present, is copied into the atom's code buffer and must be exactly
///   `size` bytes long (asserted).
/// * `relocs` are the relocations of the whole section; only those falling inside the
///   atom's byte range are parsed into the atom.
/// * `indexes` are further symbols living inside the atom; each one is recorded in
///   `atom.contained` with its offset from the atom's symbol.
/// * `match` is the output section: the `n_sect` of the atom's symbol and of every
///   contained symbol is rewritten to its ordinal.
///
/// The atom is added to `managed_atoms` and mapped in `atom_by_index_table` under
/// `sym_index` and under every index in `indexes`; none of them may be mapped already.
fn createAtomFromSubsection(
    self: *Object,
    macho_file: *MachO,
    object_id: u32,
    sym_index: u32,
    size: u64,
    alignment: u32,
    code: ?[]const u8,
    relocs: []const macho.relocation_info,
    indexes: []const SymbolAtIndex,
    match: MatchingSection,
    sect: macho.section_64,
) !*Atom {
    const allocator = macho_file.base.allocator;
    // Snapshot of the atom's symbol taken before its `n_sect` is rewritten below; only
    // `n_strx` and `n_value` are read from the copy.
    const atom_sym = self.symtab.items[sym_index];
    const new_atom = try MachO.createEmptyAtom(allocator, sym_index, size, alignment);
    new_atom.file = object_id;
    self.symtab.items[sym_index].n_sect = macho_file.getSectionOrdinal(match);

    log.debug("creating ATOM(%{d}, '{s}') in sect({d}, '{s},{s}') in object({d})", .{
        sym_index,
        self.getString(atom_sym.n_strx),
        macho_file.getSectionOrdinal(match),
        macho_file.getSection(match).segName(),
        macho_file.getSection(match).sectName(),
        object_id,
    });

    try self.atom_by_index_table.putNoClobber(allocator, sym_index, new_atom);
    try self.managed_atoms.append(allocator, new_atom);

    if (code) |bytes| {
        assert(size == bytes.len);
        mem.copy(u8, new_atom.code.items, bytes);
    }

    // Relocation addresses are relative to the section start, so select the atom's
    // relocations by its byte range within the section.
    const offset_in_section = atom_sym.n_value - sect.addr;
    const atom_relocs = filterRelocs(relocs, offset_in_section, offset_in_section + size);
    try new_atom.parseRelocs(atom_relocs, .{
        .macho_file = macho_file,
        .base_addr = sect.addr,
        .base_offset = @intCast(i32, offset_in_section),
    });

    // Record every additional symbol that lives inside this atom together with its offset
    // from the atom's own symbol, so that addresses can be allocated properly down the
    // line. While at it, point each of those symbols at the output section as well.
    try new_atom.contained.ensureTotalCapacity(allocator, indexes.len);
    for (indexes) |contained_index| {
        const contained_sym = &self.symtab.items[contained_index.index];
        contained_sym.n_sect = macho_file.getSectionOrdinal(match);
        new_atom.contained.appendAssumeCapacity(.{
            .sym_index = contained_index.index,
            .offset = contained_sym.n_value - atom_sym.n_value,
        });

        try self.atom_by_index_table.putNoClobber(allocator, contained_index.index, new_atom);
    }

    return new_atom;
}

/// Copies the source symbol table into `self.symtab` and points `self.strtab` at the
/// string table inside `self.contents`. Does nothing if the object has no symtab
/// load command.
fn parseSymtab(self: *Object, allocator: Allocator) !void {
    if (self.symtab_cmd_index) |cmd_index| {
        const symtab_cmd = self.load_commands.items[cmd_index].symtab;
        try self.symtab.appendSlice(allocator, self.getSourceSymtab());
        // `stroff` is used as an index into `contents`; `parse` has already rebased it.
        self.strtab = self.contents[symtab_cmd.stroff..][0..symtab_cmd.strsize];
    }
}

/// Returns the symbol table exactly as stored in the object file, as a view into
/// `self.contents`. Unlike `self.symtab` it never includes symbols added later.
/// Returns an empty slice if the object has no symtab load command.
pub fn getSourceSymtab(self: Object) []const macho.nlist_64 {
    if (self.symtab_cmd_index) |cmd_index| {
        const symtab_cmd = self.load_commands.items[cmd_index].symtab;
        const byte_len = @sizeOf(macho.nlist_64) * symtab_cmd.nsyms;
        const bytes = self.contents[symtab_cmd.symoff..][0..byte_len];
        const aligned_bytes = @alignCast(@alignOf(macho.nlist_64), bytes);
        return mem.bytesAsSlice(macho.nlist_64, aligned_bytes);
    }
    return &[0]macho.nlist_64{};
}

/// Returns the symbol at `index` in the source symbol table (see `getSourceSymtab`),
/// or `null` if `index` is past its end.
pub fn getSourceSymbol(self: Object, index: u32) ?macho.nlist_64 {
    const source_symtab = self.getSourceSymtab();
    return if (index < source_symtab.len) source_symtab[index] else null;
}

/// Returns a copy of the section header at `index` in the object's segment.
/// Requires that a segment command was parsed; asserts that `index` is in range.
pub fn getSourceSection(self: Object, index: u16) macho.section_64 {
    const segment_cmd = self.load_commands.items[self.segment_cmd_index.?];
    const sections = segment_cmd.segment.sections.items;
    assert(index < sections.len);
    return sections[index];
}

/// Returns the object's data-in-code entries as a view into `self.contents`, or `null`
/// if the object has no data-in-code load command.
pub fn parseDataInCode(self: Object) ?[]const macho.data_in_code_entry {
    if (self.data_in_code_cmd_index) |cmd_index| {
        const dice_cmd = self.load_commands.items[cmd_index].linkedit_data;
        const bytes = self.contents[dice_cmd.dataoff..][0..dice_cmd.datasize];
        const aligned_bytes = @alignCast(@alignOf(macho.data_in_code_entry), bytes);
        return mem.bytesAsSlice(macho.data_in_code_entry, aligned_bytes);
    }
    return null;
}

/// Returns the raw bytes of the section at `index` as a view into `self.contents`.
/// Fails with `error.Overflow` if the section's size does not fit in `usize`.
pub fn getSectionContents(self: Object, index: u16) error{Overflow}![]const u8 {
    const section = self.getSourceSection(index);
    if (math.cast(usize, section.size)) |byte_len| {
        log.debug("getting {s},{s} data at 0x{x} - 0x{x}", .{
            section.segName(),
            section.sectName(),
            section.offset,
            section.offset + section.size,
        });
        return self.contents[section.offset..][0..byte_len];
    }
    return error.Overflow;
}

/// Returns the string that starts at offset `off` of the string table, up to (not
/// including) the next zero byte. Asserts that `off` lies within the string table.
pub fn getString(self: Object, off: u32) []const u8 {
    assert(off < self.strtab.len);
    const string_start = @ptrCast([*:0]const u8, self.strtab.ptr + off);
    return mem.sliceTo(string_start, 0);
}

/// Returns the atom recorded for `sym_index` in `atom_by_index_table`, or `null` if no
/// atom has been recorded for that symbol.
pub fn getAtomForSymbol(self: Object, sym_index: u32) ?*Atom {
    return self.atom_by_index_table.get(sym_index);
}