From 94269a0bc5e3c149e52c5c21e169c4f329fa048e Mon Sep 17 00:00:00 2001 From: Gabriel Vasile Date: Wed, 20 Nov 2019 23:30:14 +0200 Subject: [PATCH 1/2] Fix msoxml detection Look for xlsx, docx, and pptx signatures in zip headers --- internal/matchers/matchers.go | 9 +++++++- internal/matchers/ms_office.go | 39 +++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/internal/matchers/matchers.go b/internal/matchers/matchers.go index e3cb2048..029914ab 100644 --- a/internal/matchers/matchers.go +++ b/internal/matchers/matchers.go @@ -3,7 +3,7 @@ package matchers // ReadLimit is the maximum number of bytes read // from the input when detecting a reader. -const ReadLimit = 2048 +const ReadLimit = 3072 // True is a dummy matching function used to match any input. func True([]byte) bool { @@ -39,3 +39,10 @@ func firstLine(in []byte) []byte { func isWS(b byte) bool { return b == '\t' || b == '\n' || b == '\x0c' || b == '\r' || b == ' ' } + +func min(a, b int) int { + if a < b { + return a + } + return b +} diff --git a/internal/matchers/ms_office.go b/internal/matchers/ms_office.go index d65d5440..9b797bad 100644 --- a/internal/matchers/ms_office.go +++ b/internal/matchers/ms_office.go @@ -3,21 +3,54 @@ package matchers import ( "bytes" "encoding/binary" + "regexp" ) +// msoXML walks through the first 4 zip local file headers and returns whether +// any of the headers contain a file whose name starts with sig. 
// msoXMLreg matches the entry names "[Content_Types].xml", "_rels/.rels" and
// "docProps". msoXML requires one of them to be present before it inspects
// any zip header.
var msoXMLreg = regexp.MustCompile(`\[Content_Types\]\.xml|_rels/\.rels|docProps`)

// msoXML walks through the first 4 zip local file headers found in in and
// reports whether any of them names a file that starts with sig.
//
// It returns false when none of the marker names matched by msoXMLreg occurs
// in the first 8000 bytes of in, when no further "PK\003\004" signature can
// be found, or when a header is cut off before its file name begins. The
// caller's slice is never modified; only the local view of it is advanced.
func msoXML(in, sig []byte) bool {
	const (
		// sniffLen bounds how much of the input is scanned for marker names.
		sniffLen = 8000
		// maxHeaders is the number of local file headers that are examined.
		maxHeaders = 4
		// nameOffset is the offset where the file name is located in each
		// zip header, counted from the start of the header signature.
		nameOffset = 30
	)
	pkSig := []byte("PK\003\004")

	head := in
	if len(head) > sniffLen {
		head = head[:sniffLen]
	}
	if !msoXMLreg.Match(head) {
		return false
	}

	for i := 0; i < maxHeaders; i++ {
		pkIndex := bytes.Index(in, pkSig)
		if pkIndex == -1 {
			return false
		}
		nameStart := pkIndex + nameOffset
		// A signature too close to the end of the buffer has no file name
		// to inspect. Stop here instead of slicing past the end of in,
		// which would panic.
		if nameStart >= len(in) {
			return false
		}
		if bytes.HasPrefix(in[nameStart:], sig) {
			return true
		}
		// Resume the search right after the fixed-size part of this header.
		in = in[nameStart:]
	}

	return false
}

// Xlsx matches a Microsoft Excel 2007 file.
func Xlsx(in []byte) bool {
	return msoXML(in, []byte("xl/"))
}

// Docx matches a Microsoft Word 2007 file.
func Docx(in []byte) bool {
	return msoXML(in, []byte("word/"))
}

// Pptx matches a Microsoft PowerPoint 2007 file.
func Pptx(in []byte) bool {
	return msoXML(in, []byte("ppt/"))
}