Skip to content

Commit

Permalink
Merge branch 'fix_msoxml'
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-vasile committed Nov 28, 2019
2 parents 22c2461 + 586852d commit 6bcdec2
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 4 deletions.
9 changes: 8 additions & 1 deletion internal/matchers/matchers.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package matchers

// ReadLimit is the maximum number of bytes read
// from the input when detecting a reader.
const ReadLimit = 3072

// True is a dummy matching function used to match any input.
func True([]byte) bool {
Expand Down Expand Up @@ -39,3 +39,10 @@ func firstLine(in []byte) []byte {
// isWS reports whether b is one of the ASCII whitespace bytes: horizontal
// tab, line feed, form feed, carriage return or space.
func isWS(b byte) bool {
	switch b {
	case '\t', '\n', '\x0c', '\r', ' ':
		return true
	default:
		return false
	}
}

// min returns the smaller of the two integers a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
39 changes: 36 additions & 3 deletions internal/matchers/ms_office.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,54 @@ package matchers
import (
"bytes"
"encoding/binary"
"regexp"
)

// msoXMLreg matches the entry names "[Content_Types].xml", "_rels/.rels" and
// "docProps". msoXML uses it as a pre-filter before walking the zip headers.
var msoXMLreg = regexp.MustCompile("\\[Content_Types\\]\\.xml|_rels/\\.rels|docProps")

// msoXML walks through the first 4 zip local file headers and returns whether
// any of the headers contain a file whose name starts with sig.
//
// Only the first 8000 bytes of in are tested against msoXMLreg; when none of
// its alternatives occur there, msoXML returns false without looking at any
// header. Truncated input (a header signature with fewer than 30 bytes
// following it) yields false instead of panicking.
func msoXML(in, sig []byte) bool {
	const (
		maxHeaders = 4    // number of local file headers inspected
		nameOffset = 30   // offset of the file name inside a local file header
		regLimit   = 8000 // number of leading bytes tested against msoXMLreg
	)
	pkSig := []byte("PK\003\004")

	head := in
	if len(head) > regLimit {
		head = head[:regLimit]
	}
	if !msoXMLreg.Match(head) {
		return false
	}

	for i := 0; i < maxHeaders; i++ {
		pkIndex := bytes.Index(in, pkSig)
		if pkIndex == -1 {
			return false
		}
		nameStart := pkIndex + nameOffset
		if nameStart >= len(in) {
			// The header is cut off before its file name, so there is
			// nothing left to compare or to scan. Slicing past the end of
			// the input here would panic.
			return false
		}
		// Continue from the file name: either it carries the wanted prefix
		// or the search for the next header resumes from this point.
		in = in[nameStart:]
		if bytes.HasPrefix(in, sig) {
			return true
		}
	}

	return false
}

// Xlsx matches a Microsoft Excel 2007 file.
func Xlsx(in []byte) bool {
return bytes.Contains(in, []byte("xl/"))
return msoXML(in, []byte("xl/"))
}

// Docx matches a Microsoft Office 2007 file.
func Docx(in []byte) bool {
return bytes.Contains(in, []byte("word/"))
return msoXML(in, []byte("word/"))
}

// Pptx matches a Microsoft PowerPoint 2007 file.
func Pptx(in []byte) bool {
return bytes.Contains(in, []byte("ppt/"))
return msoXML(in, []byte("ppt/"))
}

// Ole matches an Open Linking and Embedding file.
Expand Down

0 comments on commit 6bcdec2

Please sign in to comment.