Skip to content

Commit c70b155

Browse files
committed
Respect MIME type aliases
* Warns when extending a type that already has the given extensions, parents, etc.
* Warns when extending an aliased type.
* Adds `MimeType.canonicalize type, instead_of: old` to replace a canonical type and turn the old canonical type into an alias.

This addresses a common problem with types like WAV, which have multiple competing MIME types: RFCs that aren't actually followed, and browser support that trumps everything else. It allows us to override Tika with what browsers actually do.
1 parent 170458c commit c70b155

12 files changed

+327
-30
lines changed

lib/marcel/magic.rb

+56-5
Original file line numberDiff line numberDiff line change
@@ -25,25 +25,58 @@ def initialize(type)
2525
# Option keys:
2626
# * <i>:extensions</i>: String list or single string of file extensions
2727
# * <i>:parents</i>: String list or single string of parent mime types
28+
# * <i>:aliases</i>: String list or single string of aliased mime types
2829
# * <i>:magic</i>: Mime magic specification
2930
# * <i>:comment</i>: Comment string
3031
def self.add(type, options)
  # Every list-like option may be nil, a single string or a (nested) list.
  listify = ->(value) { [value].flatten.compact }

  exts = listify.call(options[:extensions])
  exts.each { |ext| EXTENSIONS[ext] = type }
  TYPE_EXTS[type] = exts

  # Drop any alias entry keyed by this type before registering its own aliases.
  TYPE_ALIASES.delete(type)
  listify.call(options[:aliases]).each { |aliased| TYPE_ALIASES[aliased] = type }

  supertypes = listify.call(options[:parents])
  TYPE_PARENTS[type] = supertypes unless supertypes.empty?

  # New magic goes to the front of the MAGIC list.
  magic = options[:magic]
  MAGIC.unshift([type, magic]) if magic
end
3846

39-
# Removes a mime type from the dictionary. You might want to do this if
47+
# Override the canonical MIME type with an alias or subtype.
48+
# * <i>type</i>: The alias or subtype that becomes the canonical mime type.
# * <i>instead_of</i>: The current canonical mime type; it becomes an alias of <i>type</i>.
#
# Raises ArgumentError if <i>instead_of</i> is itself an alias or is the same as <i>type</i>.
def self.canonicalize(type, instead_of:)
  raise ArgumentError, "#{instead_of} is an alias, not canonical" if TYPE_ALIASES[instead_of]
  # Without this guard, remove(type) below would wipe the type's only registration.
  raise ArgumentError, "#{type} is already canonical" if type == instead_of

  # Remove the alias or subtype first
  remove(type)

  # Replace the old canonical
  EXTENSIONS.select { |_, t| t == instead_of }.each_key do |ext|
    EXTENSIONS[ext] = type
  end

  # Keep the type => extensions table in step with EXTENSIONS, otherwise the
  # new canonical type reports no extensions.
  if (exts = TYPE_EXTS.delete(instead_of))
    TYPE_EXTS[type] = exts
  end

  TYPE_ALIASES.select { |_, t| t == instead_of }.each_key do |aliased|
    TYPE_ALIASES[aliased] = type
  end

  # Only record parents when the old canonical had some, so no nil entry is stored.
  if (parents = TYPE_PARENTS.delete(instead_of))
    TYPE_PARENTS[type] = parents
  end

  MAGIC.each { |pair| pair[0] = type if pair[0] == instead_of }

  # Alias the old canonical
  TYPE_ALIASES[instead_of] = type
end
70+
71+
# Removes a mime type from the dictionary. You might want to do this if
4072
# you're seeing impossible conflicts (for instance, application/x-gmc-link).
41-
# * <i>type</i>: The mime type to remove. All associated extensions and magic are removed too.
73+
# * <i>type</i>: The mime type to remove.
4274
def self.remove(type)
  # Drop the type from both directions of the extension lookup.
  EXTENSIONS.reject! { |_ext, mapped| mapped == type }
  TYPE_EXTS.delete(type)

  MAGIC.reject! { |magic_type, _matchers| magic_type == type }
  TYPE_PARENTS.delete(type)

  # Forget the type both as an alias and as the target of other aliases.
  TYPE_ALIASES.delete_if { |aliased, canonical| [aliased, canonical].include?(type) }
end
4881

4982
# Returns true if type is a text format
@@ -64,11 +97,24 @@ def extensions
6497
TYPE_EXTS[type] || []
6598
end
6699

100+
# Returns the canonical form of this mime type: a new instance for the
# aliased-to type when this type is listed in TYPE_ALIASES, otherwise self.
def canonical
  target = TYPE_ALIASES[type]
  target ? self.class.new(target) : self
end
107+
67108
# Get mime comment
68109
def comment
69110
nil # deprecated
70111
end
71112

113+
# Lookup canonical mime type by mime type string
114+
def self.by_type(type)
  # No type given: nothing to look up.
  return unless type

  new(type.downcase).canonical
end
117+
72118
# Lookup mime type by file extension
73119
def self.by_extension(ext)
74120
ext = ext.to_s.downcase
@@ -111,9 +157,14 @@ def hash
111157
alias == eql?
112158

113159
# Returns a truthy value when +child+ is +parent+ or descends from it via
# TYPE_PARENTS. Both arguments are resolved to canonical types first.
def self.child?(child, parent)
  child = canonical(child)
  parent = canonical(parent)
  return true if child == parent

  # Walk up the declared parents recursively.
  TYPE_PARENTS[child]&.any? { |ancestor| child?(ancestor, parent) }
end
116163

164+
# Resolves a mime type string to its canonical type string via by_type.
# Returns nil when by_type returns nil.
def self.canonical(aliased_type)
  resolved = by_type(aliased_type)
  resolved&.type
end
167+
117168
def self.magic_match(io, method)
118169
return magic_match(StringIO.new(io.to_s), method) unless io.respond_to?(:read)
119170

lib/marcel/mime_type.rb

+32-6
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,37 @@ class MimeType
55
BINARY = "application/octet-stream"
66

77
class << self
8-
def extend(type, extensions: [], parents: [], magic: nil)
9-
extensions = (Array(extensions) + Array(Marcel::TYPE_EXTS[type])).uniq
10-
parents = (Array(parents) + Array(Marcel::TYPE_PARENTS[type])).uniq
11-
Magic.add(type, extensions: extensions, magic: magic, parents: parents)
8+
# Makes +type+ the canonical MIME type in place of +instead_of+.
# Delegates to Magic.canonicalize.
def canonicalize(type, instead_of:)
  Magic.canonicalize(type, instead_of: instead_of)
end
11+
12+
# Extends a MIME type with additional extensions, aliases, parents and magic
# matchers, merging them with what is already registered for the type.
# Warns when the given values merely repeat the existing registration.
#
# * <i>type</i>: MIME type string
# * <i>extensions</i>, <i>aliases</i>, <i>parents</i>: string or list of strings
# * <i>magic</i>: list of magic matchers for this type
def extend(type, extensions: nil, aliases: nil, parents: nil, magic: nil)
  extensions = Array(extensions)
  existing_extensions = Array(Marcel::TYPE_EXTS[type])
  if extensions.any? && extensions.sort == existing_extensions.sort
    warn "#{type} already has extensions #{extensions.inspect}"
  end
  extensions |= existing_extensions

  aliases = Array(aliases)
  existing_aliases = Marcel::TYPE_ALIASES.select { |_, t| t == type }.keys
  if aliases.any? && aliases.sort == existing_aliases.sort
    warn "#{type} already has aliases #{aliases.inspect}"
  end
  aliases |= existing_aliases

  parents = Array(parents)
  existing_parents = Array(Marcel::TYPE_PARENTS[type])
  if parents.any? && parents.sort == existing_parents.sort
    warn "#{type} already has parents #{parents.inspect}"
  end
  parents |= existing_parents

  magic = Array(magic)
  # The block parameter must not be called +type+: shadowing the argument
  # turned the filter into `type == type`, which selects every entry.
  existing_magic = Marcel::MAGIC.select { |magic_type, _| magic_type == type }.map(&:last)
  # Each MAGIC entry holds one matcher list, so look for this list among them.
  if magic.any? && existing_magic.include?(magic)
    warn "#{type} already has magic matchers #{magic.inspect}"
  end

  # Pass nil rather than [] so Magic.add does not register an empty matcher list.
  Magic.add type, extensions: extensions, magic: (magic unless magic.empty?), aliases: aliases, parents: parents
end
1340

1441
# Returns the most appropriate content type for the given file.
@@ -32,7 +59,6 @@ def for(pathname_or_io = nil, name: nil, extension: nil, declared_type: nil)
3259
end
3360

3461
private
35-
3662
def for_data(pathname_or_io)
3763
if pathname_or_io
3864
with_io(pathname_or_io) do |io|
@@ -60,7 +86,7 @@ def for_extension(extension)
6086
end
6187

6288
def for_declared_type(declared_type)
63-
type = parse_media_type(declared_type)
89+
type = Marcel::Magic.canonical(parse_media_type(declared_type))
6490

6591
# application/octet-stream is treated as an undeclared/missing type,
6692
# allowing the type to be inferred from the filename. If there's no

lib/marcel/mime_type/definitions.rb

+15-15
Original file line numberDiff line numberDiff line change
@@ -28,30 +28,30 @@
2828
Marcel::MimeType.extend "application/vnd.ms-powerpoint.template.macroenabled.12", parents: "application/vnd.openxmlformats-officedocument.presentationml.presentation"
2929
Marcel::MimeType.extend "application/vnd.ms-powerpoint.slideshow.macroenabled.12", parents: "application/vnd.openxmlformats-officedocument.presentationml.presentation"
3030

31-
Marcel::MimeType.extend "application/vnd.apple.pages", extensions: %w( pages ), parents: "application/zip"
32-
Marcel::MimeType.extend "application/vnd.apple.numbers", extensions: %w( numbers ), parents: "application/zip"
33-
Marcel::MimeType.extend "application/vnd.apple.keynote", extensions: %w( key ), parents: "application/zip"
31+
Marcel::MimeType.extend "application/vnd.apple.pages", parents: "application/zip"
32+
Marcel::MimeType.extend "application/vnd.apple.numbers", parents: "application/zip"
33+
Marcel::MimeType.extend "application/vnd.apple.keynote", parents: "application/zip"
3434

35-
Marcel::MimeType.extend "audio/aac", extensions: %w( aac ), parents: "audio/x-aac"
36-
Marcel::MimeType.extend("audio/ogg", extensions: %w( ogg oga ), magic: [[0, 'OggS', [[29, 'vorbis']]]])
35+
# Upstream aliases to application/x-x509-cert. Override with a ;format=pem subtype.
36+
Marcel::MimeType.extend "application/x-x509-ca-cert", magic: [[0, '-----BEGIN CERTIFICATE-----']], extensions: %w( pem ), parents: "application/x-x509-cert;format=pem"
3737

38-
Marcel::MimeType.extend "image/vnd.dwg", magic: [[0, "AC10"]]
38+
Marcel::MimeType.extend "audio/mpc", magic: [[0, "MPCKSH"]], extensions: %w( mpc )
39+
Marcel::MimeType.extend "audio/ogg", extensions: %w( ogg oga ), magic: [[0, 'OggS', [[29, 'vorbis']]]]
40+
Marcel::MimeType.canonicalize "audio/aac", instead_of: "audio/x-aac"
41+
Marcel::MimeType.canonicalize "audio/flac", instead_of: "audio/x-flac"
42+
Marcel::MimeType.canonicalize "audio/x-wav", instead_of: "audio/vnd.wave"
3943

40-
Marcel::MimeType.extend "application/x-x509-ca-cert", magic: [[0, '-----BEGIN CERTIFICATE-----']], extensions: %w( pem ), parents: "application/x-x509-cert;format=pem"
44+
Marcel::MimeType.extend "image/vnd.dwg", magic: [[0, "AC10"]]
4145

42-
Marcel::MimeType.extend "image/avif", magic: [[4, "ftypavif"]], extensions: %w( avif )
43-
Marcel::MimeType.extend "image/heif", magic: [[4, "ftypmif1"]], extensions: %w( heif )
44-
Marcel::MimeType.extend "image/heic", magic: [[4, "ftypheic"]], extensions: %w( heic )
46+
Marcel::MimeType.extend "image/avif", magic: [[4, "ftypavif"]]
47+
Marcel::MimeType.extend "image/heif", magic: [[4, "ftypmif1"]]
48+
Marcel::MimeType.extend "image/heic", magic: [[4, "ftypheic"]]
4549

4650
Marcel::MimeType.extend "image/x-raw-sony", extensions: %w( arw ), parents: "image/tiff"
47-
Marcel::MimeType.extend "image/x-raw-canon", extensions: %w( cr2 crw ), parents: "image/tiff"
51+
Marcel::MimeType.extend "image/x-raw-canon", parents: "image/tiff"
4852

4953
Marcel::MimeType.extend "video/mp4", magic: [[4, "ftypisom"], [4, "ftypM4V "]], extensions: %w( mp4 m4v )
5054

51-
Marcel::MimeType.extend "audio/flac", magic: [[0, 'fLaC']], extensions: %w( flac ), parents: "audio/x-flac"
52-
Marcel::MimeType.extend "audio/x-wav", magic: [[0, 'RIFF', [[8, 'WAVE']]]], extensions: %w( wav ), parents: "audio/vnd.wav"
53-
Marcel::MimeType.extend "audio/mpc", magic: [[0, "MPCKSH"]], extensions: %w( mpc )
54-
5555
Marcel::MimeType.extend "font/ttf", magic: [[0, "\x00\x01\x00\x00"]], extensions: %w( ttf ttc )
5656
Marcel::MimeType.extend "font/otf", magic: [[0, "OTTO"]], extensions: %w( otf ), parents: "font/ttf"
5757
Marcel::MimeType.extend "application/vnd.adobe.flash.movie", magic: [[0, "FWS"], [0, "CWS"]], extensions: %w( swf )

lib/marcel/tables.rb

+143
Original file line numberDiff line numberDiff line change
@@ -2148,6 +2148,149 @@ module Marcel
21482148
'video/x-sgi-movie' => %w(movie),
21492149
'x-conference/x-cooltalk' => %w(ice), # Cooltalk Audio
21502150
}
2151+
TYPE_ALIASES = {
2152+
'application/bat' => 'application/x-bat',
2153+
'application/x-coreldraw' => 'application/coreldraw',
2154+
'application/x-cdr' => 'application/coreldraw',
2155+
'application/cdr' => 'application/coreldraw',
2156+
'image/x-cdr' => 'application/coreldraw',
2157+
'image/cdr' => 'application/coreldraw',
2158+
'application/x-setupscript' => 'application/inf',
2159+
'application/x-wine-extension-inf' => 'application/inf',
2160+
'application/x-javascript' => 'application/javascript',
2161+
'text/javascript' => 'application/javascript',
2162+
'application/x-java-vm' => 'application/java-vm',
2163+
'application/x-java' => 'application/java-vm',
2164+
'application/mac-binhex' => 'application/mac-binhex40',
2165+
'application/binhex' => 'application/mac-binhex40',
2166+
'application/vnd.ms-word' => 'application/msword',
2167+
'application/x-ogg' => 'audio/vorbis',
2168+
'application/msonenote' => 'application/onenote',
2169+
'application/x-pdf' => 'application/pdf',
2170+
'application/pgp' => 'application/pgp-encrypted',
2171+
'text/rss' => 'application/rss+xml',
2172+
'text/rtf' => 'application/rtf',
2173+
'application/smil' => 'application/smil+xml',
2174+
'application/x-kchart' => 'application/vnd.kde.kchart',
2175+
'application/x-kpresenter' => 'application/vnd.kde.kpresenter',
2176+
'application/x-kspread' => 'application/vnd.kde.kspread',
2177+
'application/x-kword' => 'application/vnd.kde.kword',
2178+
'application/x-koan' => 'application/vnd.koan',
2179+
'application/x-123' => 'application/vnd.lotus-1-2-3',
2180+
'application/x-mif' => 'application/vnd.mif',
2181+
'application/x-frame' => 'application/vnd.mif',
2182+
'application/msexcel' => 'application/vnd.ms-excel',
2183+
'application/mspowerpoint' => 'application/vnd.ms-powerpoint',
2184+
'application/ms-tnef' => 'application/vnd.ms-tnef',
2185+
'application/oxps' => 'application/vnd.ms-xpsdocument',
2186+
'application/x-vnd.oasis.opendocument.chart' => 'application/vnd.oasis.opendocument.chart',
2187+
'application/x-vnd.oasis.opendocument.chart-template' => 'application/vnd.oasis.opendocument.chart-template',
2188+
'application/vnd.oasis.opendocument.database' => 'application/vnd.oasis.opendocument.base',
2189+
'application/x-vnd.oasis.opendocument.formula' => 'application/vnd.oasis.opendocument.formula',
2190+
'application/x-vnd.oasis.opendocument.formula-template' => 'application/vnd.oasis.opendocument.formula-template',
2191+
'application/x-vnd.oasis.opendocument.graphics' => 'application/vnd.oasis.opendocument.graphics',
2192+
'application/x-vnd.oasis.opendocument.graphics-template' => 'application/vnd.oasis.opendocument.graphics-template',
2193+
'application/x-vnd.oasis.opendocument.image' => 'application/vnd.oasis.opendocument.image',
2194+
'application/x-vnd.oasis.opendocument.image-template' => 'application/vnd.oasis.opendocument.image-template',
2195+
'application/x-vnd.oasis.opendocument.presentation' => 'application/vnd.oasis.opendocument.presentation',
2196+
'application/x-vnd.oasis.opendocument.presentation-template' => 'application/vnd.oasis.opendocument.presentation-template',
2197+
'application/x-vnd.oasis.opendocument.spreadsheet' => 'application/vnd.oasis.opendocument.spreadsheet',
2198+
'application/x-vnd.oasis.opendocument.spreadsheet-template' => 'application/vnd.oasis.opendocument.spreadsheet-template',
2199+
'application/x-vnd.oasis.opendocument.text' => 'application/vnd.oasis.opendocument.text',
2200+
'application/x-vnd.oasis.opendocument.text-master' => 'application/vnd.oasis.opendocument.text-master',
2201+
'application/x-vnd.oasis.opendocument.text-template' => 'application/vnd.oasis.opendocument.text-template',
2202+
'application/x-vnd.oasis.opendocument.text-web' => 'application/vnd.oasis.opendocument.text-web',
2203+
'application/x-vnd.sun.xml.writer' => 'application/vnd.sun.xml.writer',
2204+
'application/vnd.ms-visio' => 'application/vnd.visio',
2205+
'image/x-targa' => 'image/x-tga',
2206+
'application/x-unix-archive' => 'application/x-archive',
2207+
'application/x-arj-compressed' => 'application/x-arj',
2208+
'application/x-dbm' => 'application/x-berkeley-db',
2209+
'application/vnd.debian.binary-package' => 'application/x-debian-package',
2210+
'application/x-Gnumeric-spreadsheet' => 'application/x-gnumeric',
2211+
'application/x-gzip' => 'application/gzip',
2212+
'application/x-gunzip' => 'application/gzip',
2213+
'application/gzipped' => 'application/gzip',
2214+
'application/gzip-compressed' => 'application/gzip',
2215+
'application/x-gzip-compressed' => 'application/gzip',
2216+
'gzip/document' => 'application/gzip',
2217+
'application/x-windows-installer' => 'application/x-ms-installer',
2218+
'application/x-msi' => 'application/x-ms-installer',
2219+
'application/x-rar' => 'application/x-rar-compressed',
2220+
'text/x-tex' => 'application/x-tex',
2221+
'text/x-texinfo' => 'application/x-texinfo',
2222+
'application/x-x509-ca-cert' => 'application/x-x509-cert',
2223+
'application/x-x509-user-cert' => 'application/x-x509-cert',
2224+
'text/xml' => 'application/xml',
2225+
'application/x-xml' => 'application/xml',
2226+
'text/x-dtd' => 'application/xml-dtd',
2227+
'text/xml-external-parsed-entity' => 'application/xml-external-parsed-entity',
2228+
'text/xsl' => 'application/xslt+xml',
2229+
'application/x-zip-compressed' => 'application/zip',
2230+
'application/x-deflate' => 'application/zlib',
2231+
'audio/x-m4a' => 'audio/mp4',
2232+
'audio/x-mp4a' => 'audio/mp4',
2233+
'audio/x-mpeg' => 'audio/mpeg',
2234+
'audio/x-ogg-flac' => 'audio/x-oggflac',
2235+
'audio/x-ogg-pcm' => 'audio/x-oggpcm',
2236+
'application/x-speex' => 'audio/speex',
2237+
'audio/aiff' => 'audio/x-aiff',
2238+
'audio/x-realaudio' => 'audio/x-pn-realaudio',
2239+
'audio/x-wav' => 'audio/vnd.wave',
2240+
'audio/wave' => 'audio/vnd.wave',
2241+
'audio/wav' => 'audio/vnd.wave',
2242+
'image/x-bmp' => 'image/bmp',
2243+
'image/x-ms-bmp' => 'image/bmp',
2244+
'image/x-emf' => 'image/emf',
2245+
'application/x-emf' => 'image/emf',
2246+
'application/x-ms-emz' => 'image/x-emf-compressed',
2247+
'image/hevc' => 'image/heic',
2248+
'image/hevc-sequence' => 'image/heic-sequence',
2249+
'video/jpm' => 'image/jpm',
2250+
'image/ntf' => 'image/nitf',
2251+
'image/x-psd' => 'image/vnd.adobe.photoshop',
2252+
'application/photoshop' => 'image/vnd.adobe.photoshop',
2253+
'image/x-dwg' => 'image/vnd.dwg',
2254+
'application/acad' => 'image/vnd.dwg',
2255+
'application/x-acad' => 'image/vnd.dwg',
2256+
'application/autocad_dwg' => 'image/vnd.dwg',
2257+
'application/dwg' => 'image/vnd.dwg',
2258+
'application/x-dwg' => 'image/vnd.dwg',
2259+
'application/x-autocad' => 'image/vnd.dwg',
2260+
'drawing/dwg' => 'image/vnd.dwg',
2261+
'image/x-icon' => 'image/vnd.microsoft.icon',
2262+
'image/x-dcx' => 'image/vnd.zbrush.dcx',
2263+
'image/x-pcx' => 'image/vnd.zbrush.pcx',
2264+
'image/x-pc-paintbrush' => 'image/vnd.zbrush.pcx',
2265+
'image/x-wmf' => 'image/wmf',
2266+
'application/x-msmetafile' => 'image/wmf',
2267+
'image/x-jb2' => 'image/x-jbig2',
2268+
'image/xcf' => 'image/x-xcf',
2269+
'application/x-mimearchive' => 'multipart/related',
2270+
'message/rfc2557' => 'multipart/related',
2271+
'drawing/x-dwf' => 'model/vnd.dwf',
2272+
'text/x-asm' => 'text/x-assembly',
2273+
'application/x-troff' => 'text/troff',
2274+
'application/x-troff-man' => 'text/troff',
2275+
'application/x-troff-me' => 'text/troff',
2276+
'application/x-troff-ms' => 'text/troff',
2277+
'text/x-c' => 'text/x-csrc',
2278+
'text/x-java' => 'text/x-java-source',
2279+
'text/x-properties' => 'text/x-java-properties',
2280+
'text/properties' => 'text/x-java-properties',
2281+
'application/x-httpd-jsp' => 'text/x-jsp',
2282+
'application/matlab-mat' => 'application/x-matlab-data',
2283+
'application/x-tcl' => 'text/x-tcl',
2284+
'video/x-daala' => 'video/daala',
2285+
'video/x-theora' => 'video/theora',
2286+
'video/x-ogg-uvs' => 'video/x-ogguvs',
2287+
'video/x-ogg-yuv' => 'video/x-oggyuv',
2288+
'video/x-ogg-rgb' => 'video/x-oggrgb',
2289+
'video/avi' => 'video/x-msvideo',
2290+
'video/msvideo' => 'video/x-msvideo',
2291+
'application/font-woff' => 'font/woff',
2292+
'application/font-woff2' => 'font/woff2',
2293+
}
21512294
TYPE_PARENTS = {
21522295
'application/bizagi-modeler' => %w(application/zip),
21532296
'application/dash+xml' => %w(application/xml),

script/generate_tables.rb

+7
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ def get_matches(mime, parent)
128128

129129
extensions = {}
130130
types = {}
131+
aliases = {}
131132
magics = []
132133

133134
ARGV.each do |path|
@@ -137,6 +138,7 @@ def get_matches(mime, parent)
137138
(doc/'mime-info/mime-type').each do |mime|
138139
comments = Hash[*(mime/'_comment').map {|comment| [comment['xml:lang'], comment.inner_text] }.flatten]
139140
type = mime['type']
141+
(mime/'alias').each { |x| aliases[x['type']] = type }
140142
subclass = (mime/'sub-class-of').map{|x| x['type']}
141143
exts = (mime/'glob').map{|x| x['pattern'] =~ /^\*\.([^\[\]]+)$/ ? $1.downcase : nil }.compact
142144
(mime/'magic').each do |magic|
@@ -222,6 +224,11 @@ def get_matches(mime, parent)
222224
puts " '#{key}' => %w(#{exts}),#{comment}"
223225
end
224226
puts " }"
227+
puts " TYPE_ALIASES = {"
228+
aliases.each do |aliased, type|
229+
puts " '#{aliased}' => '#{type}',"
230+
end
231+
puts " }"
225232
puts " TYPE_PARENTS = {"
226233
types.keys.sort.each do |key|
227234
parents = types[key][1].sort.join(' ')

test/declared_type_test.rb

+5
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,9 @@ class Marcel::MimeType::DeclaredTypeTest < Marcel::TestCase
1919
test "ignores charset declarations" do
2020
assert_equal "text/html", Marcel::MimeType.for(declared_type: "text/html; charset=utf-8")
2121
end
22+
23+
test "resolves declared type to a canonical MIME type" do
24+
aliased, canonical = Marcel::TYPE_ALIASES.first
25+
assert_equal canonical, Marcel::MimeType.for(declared_type: aliased)
26+
end
2227
end

0 commit comments

Comments
 (0)