Skip to content

Commit

Permalink
Merge branch 'ibraimgm-warc'
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-vasile committed Oct 4, 2019
2 parents 6be6bfc + 0078266 commit 84e752a
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 2 deletions.
5 changes: 5 additions & 0 deletions internal/matchers/archive.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,8 @@ func Rar(in []byte) bool {
}
return len(in) > 8 && (bytes.Equal(in[6:8], []byte{0x01, 0x00}) || in[6] == 0x00)
}

// Warc reports whether in looks like a Web ARChive (WARC) file, i.e. whether
// its first bytes are the literal marker "WARC/".
func Warc(in []byte) bool {
	magic := []byte("WARC/")
	// Inputs shorter than the marker cannot carry it.
	if len(in) < len(magic) {
		return false
	}
	return bytes.Equal(in[:len(magic)], magic)
}
1 change: 1 addition & 0 deletions mime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ var files = map[string]*node{
"djvu.djvu": djvu,
"mobi.mobi": mobi,
"lit.lit": lit,
"warc.warc": warc,

// images
"png.png": png,
Expand Down
3 changes: 2 additions & 1 deletion supported_mimes.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 118 Supported MIME types
## 119 Supported MIME types
This file is automatically generated when running tests. Do not edit manually.

Extension | MIME type
Expand Down Expand Up @@ -101,6 +101,7 @@ Extension | MIME type
**tsv** | text/tab-separated-values
**vcf** | text/vcard
**ics** | text/calendar
**warc** | application/warc
**gz** | application/gzip
**class** | application/x-java-applet; charset=binary
**swf** | application/x-shockwave-flash
Expand Down
18 changes: 18 additions & 0 deletions testdata/warc.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
WARC/1.1
WARC-Type: warcinfo
WARC-Date: 2006-09-19T17:20:14Z
WARC-Record-ID: <urn:uuid:d7ae5c10-e6b3-4d27-967d-34780c58ba39>
Content-Type: application/warc-fields
Content-Length: 381

software: Heritrix 1.12.0 http://crawler.archive.org
hostname: crawling017.archive.org
ip: 207.241.227.234
isPartOf: testcrawl-20050708
description: testcrawl with WARC output
operator: IA_Admin
http-header-user-agent:
Mozilla/5.0 (compatible; heritrix/1.4.0 +http://crawler.archive.org)
format: WARC file version 1.1
conformsTo:
http://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/
3 changes: 2 additions & 1 deletion tree.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ var (
ogg = newNode("application/ogg", "ogg", matchers.Ogg, oggAudio, oggVideo)
oggAudio = newNode("audio/ogg", "oga", matchers.OggAudio)
oggVideo = newNode("video/ogg", "ogv", matchers.OggVideo)
txt = newNode("text/plain", "txt", matchers.Txt, html, svg, xml, php, js, lua, perl, python, json, ndJson, rtf, tcl, csv, tsv, vCard, iCalendar)
txt = newNode("text/plain", "txt", matchers.Txt, html, svg, xml, php, js, lua, perl, python, json, ndJson, rtf, tcl, csv, tsv, vCard, iCalendar, warc)
xml = newNode("text/xml; charset=utf-8", "xml", matchers.Xml, rss, atom, x3d, kml, xliff, collada, gml, gpx, tcx, amf, threemf)
json = newNode("application/json", "json", matchers.Json, geoJson)
csv = newNode("text/csv", "csv", matchers.Csv)
Expand Down Expand Up @@ -132,4 +132,5 @@ var (
lit = newNode("application/x-ms-reader", "lit", matchers.Lit)
sqlite3 = newNode("application/x-sqlite3", "sqlite", matchers.Sqlite)
dwg = newNode("image/vnd.dwg", "dwg", matchers.Dwg)
warc = newNode("application/warc", "warc", matchers.Warc)
)

0 comments on commit 84e752a

Please sign in to comment.