From 007826679915200dd4f278d32624d182a0a76ed8 Mon Sep 17 00:00:00 2001 From: Rafael Ibraim Date: Tue, 1 Oct 2019 08:46:06 -0300 Subject: [PATCH] Added support for WARC file types --- internal/matchers/archive.go | 5 +++++ mime_test.go | 1 + supported_mimes.md | 3 ++- testdata/warc.warc | 18 ++++++++++++++++++ tree.go | 3 ++- 5 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 testdata/warc.warc diff --git a/internal/matchers/archive.go b/internal/matchers/archive.go index db3b2150..fcf79327 100644 --- a/internal/matchers/archive.go +++ b/internal/matchers/archive.go @@ -76,3 +76,8 @@ func Rar(in []byte) bool { } return len(in) > 8 && (bytes.Equal(in[6:8], []byte{0x01, 0x00}) || in[6] == 0x00) } + +// Warc matches a Web ARChive file +func Warc(in []byte) bool { + return bytes.HasPrefix(in, []byte("WARC/")) +} diff --git a/mime_test.go b/mime_test.go index ecf80944..36d04363 100644 --- a/mime_test.go +++ b/mime_test.go @@ -48,6 +48,7 @@ var files = map[string]*node{ "djvu.djvu": djvu, "mobi.mobi": mobi, "lit.lit": lit, + "warc.warc": warc, // images "png.png": png, diff --git a/supported_mimes.md b/supported_mimes.md index a99b97cd..0cc6fea8 100644 --- a/supported_mimes.md +++ b/supported_mimes.md @@ -1,4 +1,4 @@ -## 118 Supported MIME types +## 119 Supported MIME types This file is automatically generated when running tests. Do not edit manually. Extension | MIME type @@ -101,6 +101,7 @@ Extension | MIME type **tsv** | text/tab-separated-values **vcf** | text/vcard **ics** | text/calendar +**warc** | application/warc **gz** | application/gzip **class** | application/x-java-applet; charset=binary **swf** | application/x-shockwave-flash diff --git a/testdata/warc.warc b/testdata/warc.warc new file mode 100644 index 00000000..ad235e8e --- /dev/null +++ b/testdata/warc.warc @@ -0,0 +1,18 @@ +WARC/1.1 +WARC-Type: warcinfo +WARC-Date: 2006-09-19T17:20:14Z +WARC-Record-ID: +Content-Type: application/warc-fields +Content-Length: 381 + +software: Heritrix 1.12.0 http://crawler.archive.org +hostname: crawling017.archive.org +ip: 207.241.227.234 +isPartOf: testcrawl-20050708 +description: testcrawl with WARC output +operator: IA\_Admin +http-header-user-agent: + Mozilla/5.0 (compatible; heritrix/1.4.0 +http://crawler.archive.org) +format: WARC file version 1.1 +conformsTo: + http://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/ diff --git a/tree.go b/tree.go index 7b58bf7c..7bb3b228 100644 --- a/tree.go +++ b/tree.go @@ -36,7 +36,7 @@ var ( ogg = newNode("application/ogg", "ogg", matchers.Ogg, oggAudio, oggVideo) oggAudio = newNode("audio/ogg", "oga", matchers.OggAudio) oggVideo = newNode("video/ogg", "ogv", matchers.OggVideo) - txt = newNode("text/plain", "txt", matchers.Txt, html, svg, xml, php, js, lua, perl, python, json, ndJson, rtf, tcl, csv, tsv, vCard, iCalendar) + txt = newNode("text/plain", "txt", matchers.Txt, html, svg, xml, php, js, lua, perl, python, json, ndJson, rtf, tcl, csv, tsv, vCard, iCalendar, warc) xml = newNode("text/xml; charset=utf-8", "xml", matchers.Xml, rss, atom, x3d, kml, xliff, collada, gml, gpx, tcx, amf, threemf) json = newNode("application/json", "json", matchers.Json, geoJson) csv = newNode("text/csv", "csv", matchers.Csv) @@ -132,4 +132,5 @@ var ( lit = newNode("application/x-ms-reader", "lit", matchers.Lit) sqlite3 = newNode("application/x-sqlite3", "sqlite", matchers.Sqlite) dwg = newNode("image/vnd.dwg", "dwg", matchers.Dwg) + warc = newNode("application/warc", "warc", matchers.Warc) )