Skip to content

Commit

Permalink
Change tar detection to use checksum instead of legal ranges of values
Browse files Browse the repository at this point in the history
Previous detection used the rules from PRONOM. This commit replaces
those rules with the check from github.com/file/file: compute checksum
for header and check if recorded checksum matches.
Fixes #464
  • Loading branch information
gabriel-vasile committed Jan 3, 2024
1 parent 02af149 commit a10ec04
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 39 deletions.
109 changes: 73 additions & 36 deletions internal/magic/archive.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package magic
import (
"bytes"
"encoding/binary"
"strconv"
)

var (
Expand Down Expand Up @@ -74,51 +75,87 @@ func CRX(raw []byte, limit uint32) bool {
}

// Tar matches a (t)ape (ar)chive file.
// Tar files are divided into 512 bytes records. First record contains a 257
// bytes header padded with NUL.
func Tar(raw []byte, _ uint32) bool {
// The "magic" header field for files in UStar (POSIX IEEE P1003.1) archives
// has the prefix "ustar". The values of the remaining bytes in this field vary
// by archiver implementation.
if len(raw) >= 512 && bytes.HasPrefix(raw[257:], []byte{0x75, 0x73, 0x74, 0x61, 0x72}) {
return true
const sizeRecord = 512

// The structure of a tar header:
// type TarHeader struct {
// Name [100]byte
// Mode [8]byte
// Uid [8]byte
// Gid [8]byte
// Size [12]byte
// Mtime [12]byte
// Chksum [8]byte
// Linkflag byte
// Linkname [100]byte
// Magic [8]byte
// Uname [32]byte
// Gname [32]byte
// Devmajor [8]byte
// Devminor [8]byte
// }

if len(raw) < sizeRecord {
return false
}
raw = raw[:sizeRecord]

if len(raw) < 256 {
// First 100 bytes of the header represent the file name.
// Check if file looks like Gentoo GLEP binary package.
if bytes.Contains(raw[:100], []byte("/gpkg-1\x00")) {
return false
}

// The older v7 format has no "magic" field, and therefore must be identified
// with heuristics based on legal ranges of values for other header fields:
// https://www.nationalarchives.gov.uk/PRONOM/Format/proFormatSearch.aspx?status=detailReport&id=385&strPageToDisplay=signatures
rules := []struct {
min, max uint8
i int
}{
{0x21, 0xEF, 0},
{0x30, 0x37, 105},
{0x20, 0x37, 106},
{0x00, 0x00, 107},
{0x30, 0x37, 113},
{0x20, 0x37, 114},
{0x00, 0x00, 115},
{0x30, 0x37, 121},
{0x20, 0x37, 122},
{0x00, 0x00, 123},
{0x30, 0x37, 134},
{0x30, 0x37, 146},
{0x30, 0x37, 153},
{0x00, 0x37, 154},
// Get the checksum recorded into the file.
recsum, err := tarParseOctal(raw[148:156])
if err != nil {
return false
}
for _, r := range rules {
if raw[r.i] < r.min || raw[r.i] > r.max {
return false
}
sum1, sum2 := tarChksum(raw)
return recsum == sum1 || recsum == sum2
}

// tarParseOctal converts octal string to decimal int.
func tarParseOctal(b []byte) (int64, error) {
// Because unused fields are filled with NULs, we need to skip leading NULs.
// Fields may also be padded with spaces or NULs.
// So we remove leading and trailing NULs and spaces to be sure.
b = bytes.Trim(b, " \x00")

if len(b) == 0 {
return 0, nil
}
x, err := strconv.ParseUint(tarParseString(b), 8, 64)
if err != nil {
return 0, err
}
return int64(x), nil

Check failure

Code scanning / CodeQL

Incorrect conversion between integer types High

Incorrect conversion of an unsigned 64-bit integer from
strconv.ParseUint
to a lower bit size type int64 without an upper bound check.
}

for _, i := range []uint8{135, 147, 155} {
if raw[i] != 0x00 && raw[i] != 0x20 {
return false
}
// tarParseString returns the bytes of b that precede the first NUL as a
// string. When b holds no NUL, all of b is returned.
func tarParseString(b []byte) string {
	end := bytes.IndexByte(b, 0)
	if end < 0 {
		end = len(b)
	}
	return string(b[:end])
}

return true
// tarChksum sums the bytes of the header block b in two ways and returns both
// totals.
//
// The eight bytes at offsets 148 through 155, where the checksum itself is
// stored, are counted as ASCII spaces (0x20) regardless of their content,
// because that is the placeholder they hold while the sum is being computed.
// POSIX specifies a sum of the unsigned byte values, but the Sun tar used
// signed byte values, so the first result adds each byte as unsigned and the
// second adds each byte as a signed 8-bit value.
func tarChksum(b []byte) (unsigned, signed int64) {
	const (
		chksumStart = 148
		chksumEnd   = 156
	)
	for pos := range b {
		v := b[pos]
		if pos >= chksumStart && pos < chksumEnd {
			// The stored checksum never contributes to itself.
			v = 0x20
		}
		signed += int64(int8(v))
		unsigned += int64(v)
	}
	return unsigned, signed
}
39 changes: 39 additions & 0 deletions internal/magic/archive_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package magic

import "testing"

// TestTarParseOctal feeds raw header field contents to tarParseOctal and
// checks both the parsed value and whether parsing succeeded.
func TestTarParseOctal(t *testing.T) {
	// Each case is the field bytes, the expected value, and whether parsing
	// is expected to succeed.
	cases := []struct {
		in   string
		want int64
		ok   bool
	}{
		{"0000000\x00", 0, true},
		{" \x0000000\x00", 0, true},
		{" \x0000003\x00", 3, true},
		{"00000000227\x00", 0227, true},
		{"032033\x00 ", 032033, true},
		{"320330\x00 ", 0320330, true},
		{"0000660\x00 ", 0660, true},
		{"\x00 0000660\x00 ", 0660, true},
		{"0123456789abcdef", 0, false},
		{"0123456789\x00abcdef", 0, false},
		{"01234567\x0089abcdef", 342391, true},
		{"0123\x7e\x5f\x264123", 0, false},
	}

	for _, tc := range cases {
		got, err := tarParseOctal([]byte(tc.in))

		switch {
		case err != nil && tc.ok:
			t.Errorf("parseOctal(%q): got parsing failure, want success", tc.in)
		case err == nil && !tc.ok:
			t.Errorf("parseOctal(%q): got parsing success, want failure", tc.in)
		}

		// The value is compared even for failing cases, which expect 0.
		if got != tc.want {
			t.Errorf("parseOctal(%q): got %d, want %d", tc.in, got, tc.want)
		}
	}
}
7 changes: 4 additions & 3 deletions mimetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,10 @@ var files = map[string]string{
"tar.oldgnu.tar": "application/x-tar",
"tar.posix.tar": "application/x-tar",
// tar.star.tar was generated with star 1.6.
"tar.star.tar": "application/x-tar",
"tar.ustar.tar": "application/x-tar",
"tar.v7.tar": "application/x-tar",
"tar.star.tar": "application/x-tar",
"tar.ustar.tar": "application/x-tar",
"tar.v7.tar": "application/x-tar",
"tar.issue464.tar": "application/x-tar",
// tar.v7-gnu.tar is a v7 tar archive generated with GNU tar 1.29.
"tar.v7-gnu.tar": "application/x-tar",
"tcl.tcl": "text/x-tcl",
Expand Down
Binary file added testdata/tar.issue464.tar
Binary file not shown.

0 comments on commit a10ec04

Please sign in to comment.