From 73f66064338fc974a71e78dccc1266e714f9cb7d Mon Sep 17 00:00:00 2001 From: Runxi Yu Date: Sun, 14 Sep 2025 05:45:40 +0800 Subject: [PATCH] Packfile parsing (probably very buggy) --- git/loose.ha | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ git/object.ha | 42 +++++++++++++++++++++++++++++++++++++++++- git/packed.ha | 652 +++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/git/loose.ha b/git/loose.ha index aff13a90d22764bdbaedbc7f84917165381f222d..61c1fa4c2bf04b5c1f256a6be7d688f382b5f31f 100644 --- a/git/loose.ha +++ b/git/loose.ha @@ -73,3 +73,56 @@ } else { return errors::invalid; }; }; + +export fn read_loose_typed( + r: repo, + id: oid, +) ((u8, []u8) | fs::error | io::error | errors::invalid | errors::noentry | strconv::invalid | strconv::overflow | nomem) = { + const rel = loose_relpath(id)?; + defer free(rel); + + let fh = fs::open(r.root, rel)?; + defer io::close(fh)!; + + let zr = zlib::decompress(fh)?; + defer io::close(&zr.vtable)!; + + let buf = io::drain(&zr.vtable)?; + defer free(buf); + + let mnul = bytes::index(buf, 0u8); + if (mnul is void) { + return errors::invalid; + }; + let nul = mnul: size; + + const header = buf[..nul]; + const body = buf[nul + 1z ..]; + + let msp = bytes::index(header, ' '); + if (msp is void) { + return errors::invalid; + }; + let sp = msp: size; + + const ty = strings::fromutf8_unsafe(header[..sp]); + const szs = strings::fromutf8_unsafe(header[sp + 1z ..]); + const expect = strconv::stoz(szs)?; + if (expect != len(body)) { + return errors::invalid; + }; + + let code: u8 = 0u8; + if (ty == "blob") { + code = OBJ_BLOB; + } else if (ty == "tree") { + code = OBJ_TREE; + } else if (ty == "commit") { + code = OBJ_COMMIT; + } else { + return errors::invalid; + }; + + let out = alloc(body...)?; + return (code, out); +}; diff --git a/git/object.ha b/git/object.ha index c4b1269016c02a48f1946958db1f4483270abe2a..04a5e38f94c717ccc3ccde166fb86dacf012eca0 100644 --- a/git/object.ha +++ b/git/object.ha @@ -1,12 +1,21 @@ use bytes; use crypto::sha256; use errors; +use fmt; use fs; use hash; use io; use strconv; +use strings; export type object = (blob | tree | commit); + +export def OBJ_COMMIT: u8 = 1u8; +export def OBJ_TREE: u8 = 2u8; +export def OBJ_BLOB: u8 = 3u8; +export def OBJ_TAG: u8 = 4u8; +export def OBJ_OFS_DELTA: u8 = 6u8; +export def OBJ_REF_DELTA: u8 = 7u8; export fn object_finish(o: object) void = { match (o) { @@ -33,6 +42,37 @@ return false; }; }; +export fn verify_typed(ty: str, body: []u8, want: oid) bool = { + let st = sha256::sha256(); + defer hash::close(&st); + + if (ty == "blob") { + hash::write(&st, strings::toutf8("blob")); + } else if (ty == "tree") { + hash::write(&st, strings::toutf8("tree")); + } else if (ty == "commit") { + hash::write(&st, strings::toutf8("commit")); + } else { + return false; + }; + + hash::write(&st, strings::toutf8(" ")); + let szs = strconv::ztos(len(body)); + hash::write(&st, strings::toutf8(szs)); + hash::write(&st, strings::toutf8("\x00")); + + hash::write(&st, body); + + let got: oid = [0...]; + hash::sum(&st, got); + + if (bytes::equal(got[..], want[..])) { + return true; + } else { + return false; + }; +}; + export fn read_object( r: repo, id: oid, @@ -42,7 +82,7 @@ case let o: object => return o; case let fe: fs::error => if (fe is errors::noentry) { - void; // fallthrough to the packfile thing + void; } else { return fe; }; diff --git a/git/packed.ha b/git/packed.ha index 45339e4c376ecc1a03599b0e63e5dcb49fe68860..5dbe531f8d3329b4d3fa1c63ea0365b61d236603 100644 --- a/git/packed.ha +++ b/git/packed.ha @@ -1,11 +1,663 @@ +use compress::zlib; +use crypto::sha256; +use endian; use errors; +use fmt; use fs; use io; use strconv; +use strings; + +def IDX_MAGIC: u32 = 0xff744f63u32; +def IDX_V2: u32 = 2u32; + +def PACK_MAGIC: u32 = 0x5041434bu32; +def PACK_V2: u32 = 2u32; + +type pack_loc = struct { + pack_rel: str, + ofs: u64, +}; + +fn cmp_oid(a: []u8, b: oid) i32 = { + for (let i = 0z; i < sha256::SZ; i += 1z) { + let av = a[i]; + let bv = b[i]; + if (av < bv) { + return -1; + }; + if (av > bv) { + return 1; + }; + }; + return 0; +}; + +fn count_large_before(off32: []u8, idx: size) size = { + let n = 0z; + for (let i = 0z; i < idx; i += 1z) { + let o32 = endian::begetu32(off32[i*4z .. i*4z + 4z]); + if ((o32 & 0x8000_0000u32) != 0u32) { + n += 1z; + }; + }; + return n; +}; export fn read_packed( r: repo, id: oid, ) (object | fs::error | io::error | errors::invalid | strconv::invalid | strconv::overflow | errors::noentry | nomem) = { + let loc = find_in_indexes(r, id)?; + return read_from_pack_at(r, loc, id); +}; + +fn find_in_indexes( + r: repo, + id: oid, +) (pack_loc | errors::noentry | fs::error | io::error | errors::invalid | nomem) = { + const dir = "objects/pack"; + let it = fs::iter(r.root, dir)?; + defer fs::finish(it); + + for (true) { + match (fs::next(it)) { + case let de: fs::dirent => + if (!strings::hassuffix(de.name, ".idx")) { + continue; + }; + + { + let rel = fmt::asprintf("{}/{}", dir, de.name)?; + match (idx_lookup(r, rel, id)) { + case let pl: pack_loc => + free(rel); + return pl; + case errors::noentry => + free(rel); + continue; + case let fe: fs::error => + free(rel); + return fe; + case let ioe: io::error => + free(rel); + return ioe; + case let inv: errors::invalid => + free(rel); + return inv; + case nomem => + free(rel); + return nomem; + }; + }; + + case done => + break; + case let fe: fs::error => + return fe; + }; + }; + return errors::noentry; }; + +fn idx_lookup( + r: repo, + idx_rel: const str, + id: oid, +) (pack_loc | errors::noentry | fs::error | io::error | errors::invalid | nomem) = { + let h = fs::open(r.root, idx_rel)?; + defer io::close(h)!; + + let buf = io::drain(h)?; + defer free(buf); + + if (len(buf) < 8z + 256z*4z) { + return errors::invalid; + }; + + let off = 0z; + let magic = endian::begetu32(buf[off..off+4]); + off += 4z; + if (magic != IDX_MAGIC) { + return errors::invalid; + }; + + let ver = endian::begetu32(buf[off..off+4]); + off += 4z; + if (ver != IDX_V2) { + return errors::invalid; + }; + + let fanout: [256]u32 = [0...]; + for (let i = 0z; i < 256z; i += 1z) { + fanout[i] = endian::begetu32(buf[off..off+4]); + off += 4z; + }; + let nobj = fanout[255]: size; + + let need = off + + nobj * sha256::SZ + + nobj * 4z + + nobj * 4z + + 2z * sha256::SZ; + if (need > len(buf)) { + return errors::invalid; + }; + + let names_off = off; + let crcs_off = names_off + nobj * sha256::SZ; + let off32_off = crcs_off + nobj * 4z; + + let large_count = 0z; + for (let i = 0z; i < nobj; i += 1z) { + let o32 = endian::begetu32(buf[off32_off + i*4z .. off32_off + i*4z + 4z]); + if ((o32 & 0x8000_0000u32) != 0u32) { + large_count += 1z; + }; + }; + + let off64_off = off32_off + nobj * 4z; + let trailer_off = off64_off + large_count * 8z; + if (trailer_off + 2z * sha256::SZ > len(buf)) { + return errors::invalid; + }; + + let first = (id[0]: u8): size; + let lo: size = if (first == 0u8) { yield 0z; } else { yield fanout[first - 1z]: size; }; + let hi: size = fanout[first]: size; + + let found = false; + let idx = 0z; + let l = lo; + let h = hi; + for (l < h) { + let m = l + (h - l) / 2z; + let cand = buf[names_off + m*sha256::SZ .. names_off + (m+1z)*sha256::SZ]; + + let c = cmp_oid(cand, id); + if (c == 0) { + found = true; + idx = m; + break; + } else if (c < 0) { + l = m + 1z; + } else { + h = m; + }; + }; + + if (!found) { + return errors::noentry; + }; + + let o32 = endian::begetu32(buf[off32_off + idx*4z .. off32_off + idx*4z + 4z]); + let ofs: u64 = 0u64; + if ((o32 & 0x8000_0000u32) == 0u32) { + ofs = (o32: u64); + } else { + let nlarge_before = count_large_before(buf[off32_off..], idx); + let p = off64_off + nlarge_before * 8z; + let o64be = endian::begetu64(buf[p .. p + 8z]); + ofs = o64be; + }; + + if (!strings::hassuffix(idx_rel, ".idx")) { + return errors::invalid; + }; + + let stem = strings::bytesub(idx_rel, 0z, len(idx_rel) - 4z)!; + let packpath = fmt::asprintf("{}{}", stem, ".pack")?; + + return pack_loc { pack_rel = packpath, ofs = ofs }; +}; + +fn read_from_pack_at( + r: repo, + loc: pack_loc, + want: oid, +) (object | fs::error | io::error | errors::invalid | strconv::invalid | strconv::overflow | errors::noentry | nomem) = { + defer free(loc.pack_rel); + + let h = fs::open(r.root, loc.pack_rel)?; + defer io::close(h)!; + + let header: [12]u8 = [0...]; + match (io::readall(h, header)) { + case size => + void; + case io::EOF => + return errors::invalid; + case let ioe: io::error => + return ioe; + }; + let magic = endian::begetu32(header[..4]); + let ver = endian::begetu32(header[4..8]); + if (magic != PACK_MAGIC || ver != PACK_V2) { + return errors::invalid; + }; + + io::seek(h, (loc.ofs: i64), io::whence::SET)?; + let ty: u8 = 0u8; + + match (read_obj_header(h)) { + case let t: (u8, size, size) => + ty = t.0; + case let ioe: io::error => + return ioe; + case => + return errors::invalid; + }; + + let full_ty: u8 = 0u8; + let body: []u8 = []; + defer if (len(body) != 0) { + free(body); + }; + + switch (ty) { + case OBJ_COMMIT => + body = inflate_section(h)?; + full_ty = OBJ_COMMIT; + case OBJ_TREE => + body = inflate_section(h)?; + full_ty = OBJ_TREE; + case OBJ_BLOB => + body = inflate_section(h)?; + full_ty = OBJ_BLOB; + case OBJ_REF_DELTA => + match (resolve_ref_delta(r, h)) { + case let t: (u8, []u8) => + full_ty = t.0; + body = t.1; + case let e: (fs::error | io::error | errors::invalid | errors::noentry | nomem) => + return e; + }; + case OBJ_OFS_DELTA => + match (resolve_ofs_delta(r, h, loc)) { + case let t: (u8, []u8) => + full_ty = t.0; + body = t.1; + case let e: (fs::error | io::error | errors::invalid | errors::noentry | nomem) => + return e; + }; + case => + return errors::invalid; + }; + + let tystr = if (full_ty == OBJ_BLOB) { + yield "blob"; + } else if (full_ty == OBJ_TREE) { + yield "tree"; + } else if (full_ty == OBJ_COMMIT) { + yield "commit"; + } else { + yield ""; + }; + if (tystr == "" || !verify_typed(tystr, body, want)) { + return errors::invalid; + }; + + if (full_ty == OBJ_BLOB) { + const b = parse_blob(want, body)?; + return (b: object); + }; + if (full_ty == OBJ_TREE) { + const t = parse_tree(want, body)?; + return (t: object); + }; + if (full_ty == OBJ_COMMIT) { + const c = parse_commit(want, body)?; + return (c: object); + }; + + return errors::invalid; +}; + +fn read_obj_header(h: io::handle) ((u8, size, size) | io::error | errors::invalid) = { + let consumed = 0z; + + let b0: [1]u8 = [0]; + match (io::readall(h, b0)) { + case size => + void; + case io::EOF => + return errors::invalid; + case let ioe: io::error => + return ioe; + }; + consumed += 1z; + + let ty = (b0[0] >> 4) & 0x07u8; + let sz: size = (b0[0] & 0x0fu8): size; + + let shift = 4z; + if ((b0[0] & 0x80u8) != 0u8) { + for (true) { + let bb: [1]u8 = [0]; + match (io::readall(h, bb)) { + case size => + void; + case io::EOF => + return errors::invalid; + case let ioe: io::error => + return ioe; + }; + consumed += 1z; + + let v = (bb[0] & 0x7fu8): size; + sz += v << shift; + if ((bb[0] & 0x80u8) == 0u8) { + break; + }; + shift += 7z; + }; + }; + + return (ty, sz, consumed); +}; + +fn inflate_section(h: io::handle) ([]u8 | io::error | nomem) = { + let zr = zlib::decompress(h)?; + defer io::close(&zr.vtable)!; + + let out = io::drain(&zr.vtable)?; + return out; +}; + +fn resolve_ref_delta( + r: repo, + h: io::handle, +) ((u8, []u8) | fs::error | io::error | errors::invalid | errors::noentry | strconv::invalid | strconv::overflow | nomem) = { + let base: oid = [0...]; + match (io::readall(h, base)) { + case size => + void; + case io::EOF => + return errors::invalid; + case let ioe: io::error => + return ioe; + }; + + let delta = inflate_section(h)?; + defer free(delta); + + let bt = read_resolved_body_by_id(r, base)?; + let out = apply_delta(bt.1, delta)?; + return (bt.0, out); +}; + +fn read_ofs_distance(h: io::handle) (u64 | io::error | errors::invalid) = { + let b: [1]u8 = [0]; + match (io::readall(h, b)) { + case size => + void; + case io::EOF => + return errors::invalid; + case let ioe: io::error => + return ioe; + }; + + let dist: u64 = (b[0] & 0x7fu8): u64; + + if ((b[0] & 0x80u8) != 0u8) { + for (true) { + match (io::readall(h, b)) { + case size => + void; + case io::EOF => + return errors::invalid; + case let ioe: io::error => + return ioe; + }; + + dist = ((dist + 1u64) << 7u64) + ((b[0] & 0x7fu8): u64); + + if ((b[0] & 0x80u8) == 0u8) { + break; + }; + }; + }; + + return dist; +}; + +fn resolve_ofs_delta( + r: repo, + h: io::handle, + loc: pack_loc, +) ((u8, []u8) | fs::error | io::error | errors::invalid | errors::noentry | strconv::invalid | strconv::overflow | nomem) = { + let dist = read_ofs_distance(h)?; + let base_ofs: u64 = if (loc.ofs > dist) { + yield loc.ofs - dist; + } else { + yield 0u64; + }; + if (base_ofs == 0u64) { + return errors::invalid; + }; + + let bt = read_resolved_body_at_ofs(r, loc.pack_rel, base_ofs)?; + let delta = inflate_section(h)?; + defer free(delta); + + let out = apply_delta(bt.1, delta)?; + return (bt.0, out); +}; + +fn read_resolved_body_by_id( + r: repo, + id: oid, +) ((u8, []u8) | fs::error | io::error | errors::invalid | errors::noentry | strconv::invalid | strconv::overflow | nomem) = { + match (find_in_indexes(r, id)) { + case let pl: pack_loc => + let res = read_resolved_body_at_ofs(r, pl.pack_rel, pl.ofs); + free(pl.pack_rel); + return res; + case errors::noentry => + return read_loose_typed(r, id); + case let fe: fs::error => + return fe; + case let ioe: io::error => + return ioe; + case let inv: errors::invalid => + return inv; + case nomem => + return nomem; + }; +}; + +fn read_resolved_body_at_ofs( + r: repo, + pack_rel: str, + ofs: u64, +) ((u8, []u8) | fs::error | io::error | errors::invalid | errors::noentry | strconv::invalid | strconv::overflow | nomem) = { + let h = fs::open(r.root, pack_rel)?; + defer io::close(h)!; + + let header: [12]u8 = [0...]; + match (io::readall(h, header)) { + case size => + void; + case io::EOF => + return errors::invalid; + case let ioe: io::error => + return ioe; + }; + let magic = endian::begetu32(header[..4]); + let ver = endian::begetu32(header[4..8]); + if (magic != PACK_MAGIC || ver != PACK_V2) { + return errors::invalid; + }; + + io::seek(h, (ofs: i64), io::whence::SET)?; + match (read_obj_header(h)) { + case let t: (u8, size, size) => + switch (t.0) { + case OBJ_COMMIT => + let body = inflate_section(h)?; + return (OBJ_COMMIT, body); + case OBJ_TREE => + let body = inflate_section(h)?; + return (OBJ_TREE, body); + case OBJ_BLOB => + let body = inflate_section(h)?; + return (OBJ_BLOB, body); + case OBJ_REF_DELTA => + let base: oid = [0...]; + match (io::readall(h, base)) { + case size => + void; + case io::EOF => + return errors::invalid; + case let ioe: io::error => + return ioe; + }; + let delta = inflate_section(h)?; + defer free(delta); + let bt = read_resolved_body_by_id(r, base)?; + let out = apply_delta(bt.1, delta)?; + return (bt.0, out); + case OBJ_OFS_DELTA => + let dist = read_ofs_distance(h)?; + let base_ofs: u64 = if (ofs > dist) { + yield ofs - dist; + } else { + yield 0u64; + }; + if (base_ofs == 0u64) { + return errors::invalid; + }; + + let delta = inflate_section(h)?; + defer free(delta); + let bt = read_resolved_body_at_ofs(r, pack_rel, base_ofs)?; + let out = apply_delta(bt.1, delta)?; + return (bt.0, out); + case => + return errors::invalid; + }; + case let ioe: io::error => + return ioe; + case => + return errors::invalid; + }; +}; + +fn apply_delta(base: []u8, delta: []u8) ([]u8 | errors::invalid | nomem) = { + let i = 0z; + + let srcsz = read_varint(delta, &i)?; + let dstsz = read_varint(delta, &i)?; + + if (srcsz != len(base)) { + return errors::invalid; + }; + let out: []u8 = alloc([0u8...], dstsz)?; + let outpos = 0z; + + for (i < len(delta)) { + let op = delta[i]; + i += 1z; + + if ((op & 0x80u8) != 0u8) { + let off = 0z; + let n = 0z; + + if ((op & 0x01u8) != 0u8) { + if (i >= len(delta)) { + return errors::invalid; + }; + off |= (delta[i]: size); + i += 1z; + }; + if ((op & 0x02u8) != 0u8) { + if (i >= len(delta)) { + return errors::invalid; + }; + off |= (delta[i]: size) << 8z; + i += 1z; + }; + if ((op & 0x04u8) != 0u8) { + if (i >= len(delta)) { + return errors::invalid; + }; + off |= (delta[i]: size) << 16z; + i += 1z; + }; + if ((op & 0x08u8) != 0u8) { + if (i >= len(delta)) { + return errors::invalid; + }; + off |= (delta[i]: size) << 24z; + i += 1z; + }; + if ((op & 0x10u8) != 0u8) { + if (i >= len(delta)) { + return errors::invalid; + }; + n |= (delta[i]: size); + i += 1z; + }; + if ((op & 0x20u8) != 0u8) { + if (i >= len(delta)) { + return errors::invalid; + }; + n |= (delta[i]: size) << 8z; + i += 1z; + }; + if ((op & 0x40u8) != 0u8) { + if (i >= len(delta)) { + return errors::invalid; + }; + n |= (delta[i]: size) << 16z; + i += 1z; + }; + if (n == 0z) { + n = 0x10000z; + }; + + if (off + n > len(base) || outpos + n > len(out)) { + return errors::invalid; + }; + + out[outpos .. outpos + n] = base[off .. off + n]; + outpos += n; + } else if (op != 0u8) { + let n = (op: size); + if (i + n > len(delta) || outpos + n > len(out)) { + return errors::invalid; + }; + out[outpos .. outpos + n] = delta[i .. i + n]; + i += n; + outpos += n; + } else { + return errors::invalid; + }; + }; + + if (outpos != len(out)) { + return errors::invalid; + }; + + return out; +}; + +fn read_varint(buf: []u8, ip: *size) (size | errors::invalid) = { + let res = 0z; + let shift = 0z; + for (true) { + if (*ip >= len(buf)) { + return errors::invalid; + }; + let b = buf[*ip]; + *ip += 1z; + + res |= ((b & 0x7fu8): size) << shift; + if ((b & 0x80u8) == 0u8) { + break; + }; + shift += 7z; + }; + return res; +}; -- 2.48.1