Skip to content

Commit

Permalink
Respect MIME type aliases
Browse files Browse the repository at this point in the history
* MIME type aliases are now supported.
* Aliases are resolved to their canonical type in all APIs.
* Introduce `MimeType.canonicalize type, instead_of: old` to override
  a Tika canonical type with our own, essentially renaming the type
  and making the old type an alias of the new one. Common scenario
  with types like WAV with multiple competing historical types, RFCs
  that aren't actually followed, and browser support trumping them all.
  This allows us to give preference to browsers' actual MIME type
  support while keeping Tika's file extensions and magic byte matchers.
* Warns when extending a type with preexisting extensions, parents, etc.
  and when extending an aliased type.
  • Loading branch information
jeremy committed Mar 7, 2024
1 parent 170458c commit 7a173b1
Show file tree
Hide file tree
Showing 12 changed files with 327 additions and 30 deletions.
61 changes: 56 additions & 5 deletions lib/marcel/magic.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,58 @@ def initialize(type)
# Option keys:
# * <i>:extensions</i>: String list or single string of file extensions
# * <i>:parents</i>: String list or single string of parent mime types
# * <i>:aliases</i>: String list or single string of aliased mime types
# * <i>:magic</i>: Mime magic specification
# * <i>:comment</i>: Comment string
def self.add(type, options)
  # Record the extensions in both directions: extension => type and
  # type => extensions.
  extensions = [options[:extensions]].flatten.compact
  extensions.each { |ext| EXTENSIONS[ext] = type }
  TYPE_EXTS[type] = extensions

  # The type is being registered in its own right, so drop any entry that
  # lists it as an alias of something else before recording its own aliases.
  TYPE_ALIASES.delete(type)
  [options[:aliases]].flatten.compact.each do |aliased|
    TYPE_ALIASES[aliased] = type
  end

  parents = [options[:parents]].flatten.compact
  TYPE_PARENTS[type] = parents unless parents.empty?

  # Prepend the matcher spec. A nil or empty spec carries no rules, so it is
  # skipped instead of leaving a [type, []] entry in MAGIC.
  magic = options[:magic]
  MAGIC.unshift [type, magic] if magic && !Array(magic).empty?
end

# Removes a mime type from the dictionary. You might want to do this if
# Override the canonical MIME type with an alias or subtype.
def self.canonicalize(type, instead_of:)
raise ArgumentError, "#{instead_of} is an alias, not canonical" if TYPE_ALIASES[instead_of]

# Remove the alias or subtype first
remove(type)

# Replace the old canonical
EXTENSIONS.select { |_, t| t == instead_of }.each_key do |ext|
EXTENSIONS[ext] = type
end

TYPE_ALIASES.select { |_, t| t == instead_of }.each_key do |aliased|
TYPE_ALIASES[aliased] = type
end

TYPE_PARENTS[type] = TYPE_PARENTS.delete(instead_of)

MAGIC.select { |t, _| t == instead_of }.each { |pair| pair[0] = type }

# Alias the old canonical
TYPE_ALIASES[instead_of] = type
end

# Removes a mime type from the dictionary. You might want to do this if
# you're seeing impossible conflicts (for instance, application/x-gmc-link).
# * <i>type</i>: The mime type to remove. All associated extensions and magic are removed too.
# * <i>type</i>: The mime type to remove.
def self.remove(type)
  # Forget every extension mapping and magic matcher registered for the type.
  EXTENSIONS.reject! { |_ext, registered| registered == type }
  MAGIC.reject! { |registered, _matchers| registered == type }

  # Drop the per-type extension and parent lists.
  [TYPE_EXTS, TYPE_PARENTS].each { |table| table.delete(type) }

  # Drop alias entries on either side: the type as an alias, and aliases
  # that resolve to the type.
  TYPE_ALIASES.delete_if { |aliased, canonical| [aliased, canonical].include?(type) }
end

# Returns true if type is a text format
Expand All @@ -64,11 +97,24 @@ def extensions
TYPE_EXTS[type] || []
end

# Resolve this type through the alias table: a new instance for the
# canonical type when this one is an alias, otherwise self.
def canonical
  target = TYPE_ALIASES[type]
  return self unless target

  self.class.new(target)
end

# Get mime comment
def comment
nil # deprecated
end

# Lookup canonical mime type by mime type string
def self.by_type(type)
  # No type given, nothing to look up.
  return unless type

  # Lookups are done on the downcased type string, then resolved through
  # the alias table.
  new(type.downcase).canonical
end

# Lookup mime type by file extension
def self.by_extension(ext)
ext = ext.to_s.downcase
Expand Down Expand Up @@ -111,9 +157,14 @@ def hash
alias == eql?

def self.child?(child, parent)
  # Compare canonical names so that aliases on either side still match.
  child = canonical(child)
  parent = canonical(parent)
  return true if child == parent

  # Otherwise walk up through each registered parent of the child.
  TYPE_PARENTS[child]&.any? { |ancestor| child?(ancestor, parent) }
end

# Resolve a MIME type string to its canonical type string via by_type.
# Returns nil when by_type finds nothing.
def self.canonical(aliased_type)
  resolved = by_type(aliased_type)
  resolved.nil? ? nil : resolved.type
end

def self.magic_match(io, method)
return magic_match(StringIO.new(io.to_s), method) unless io.respond_to?(:read)

Expand Down
38 changes: 32 additions & 6 deletions lib/marcel/mime_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,37 @@ class MimeType
BINARY = "application/octet-stream"

class << self
def extend(type, extensions: [], parents: [], magic: nil)
extensions = (Array(extensions) + Array(Marcel::TYPE_EXTS[type])).uniq
parents = (Array(parents) + Array(Marcel::TYPE_PARENTS[type])).uniq
Magic.add(type, extensions: extensions, magic: magic, parents: parents)
# Make +type+ the canonical MIME type in place of +instead_of+.
# Delegates to Magic.canonicalize.
def canonicalize(type, instead_of:)
  Magic.canonicalize(type, instead_of: instead_of)
end

# Extend (or newly register) a MIME type with extra extensions, aliases,
# parents and magic matchers. Given values are merged with what is already
# registered for the type, and a warning is printed when the caller restates
# exactly what is already there.
#
# * <i>type</i>: MIME type string to extend
# * <i>:extensions</i>, <i>:aliases</i>, <i>:parents</i>: String or list of strings
# * <i>:magic</i>: List of magic matcher rules
def extend(type, extensions: nil, aliases: nil, parents: nil, magic: nil)
  # Warn when +given+ merely repeats +existing+, then return their union
  # with the given values first.
  merge = lambda do |label, given, existing|
    given = Array(given)
    existing = Array(existing)
    if given.any? && given.sort == existing.sort
      warn "#{type} already has #{label} #{given.inspect}"
    end
    given | existing
  end

  existing_aliases = Marcel::TYPE_ALIASES.select { |_, canonical| canonical == type }.keys

  extensions = merge.call("extensions", extensions, Marcel::TYPE_EXTS[type])
  aliases = merge.call("aliases", aliases, existing_aliases)
  parents = merge.call("parents", parents, Marcel::TYPE_PARENTS[type])

  magic = Array(magic)
  # MAGIC holds [type, rules] pairs. The block parameter must not be named
  # +type+, or it shadows the argument and every pair is selected. The rule
  # lists are flattened one level so they compare against the given list.
  existing_magic = Marcel::MAGIC.select { |registered, _| registered == type }.flat_map(&:last)
  if magic.any? && magic == existing_magic
    warn "#{type} already has magic matchers #{magic.inspect}"
  end

  # Pass nil rather than [] when no matchers were given: Magic.add registers
  # any truthy :magic value, and an empty array is truthy.
  Magic.add type, extensions: extensions, magic: (magic unless magic.empty?), aliases: aliases, parents: parents
end

# Returns the most appropriate content type for the given file.
Expand All @@ -32,7 +59,6 @@ def for(pathname_or_io = nil, name: nil, extension: nil, declared_type: nil)
end

private

def for_data(pathname_or_io)
if pathname_or_io
with_io(pathname_or_io) do |io|
Expand Down Expand Up @@ -60,7 +86,7 @@ def for_extension(extension)
end

def for_declared_type(declared_type)
type = parse_media_type(declared_type)
type = Marcel::Magic.canonical(parse_media_type(declared_type))

# application/octet-stream is treated as an undeclared/missing type,
# allowing the type to be inferred from the filename. If there's no
Expand Down
30 changes: 15 additions & 15 deletions lib/marcel/mime_type/definitions.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,30 @@
Marcel::MimeType.extend "application/vnd.ms-powerpoint.template.macroenabled.12", parents: "application/vnd.openxmlformats-officedocument.presentationml.presentation"
Marcel::MimeType.extend "application/vnd.ms-powerpoint.slideshow.macroenabled.12", parents: "application/vnd.openxmlformats-officedocument.presentationml.presentation"

Marcel::MimeType.extend "application/vnd.apple.pages", extensions: %w( pages ), parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.numbers", extensions: %w( numbers ), parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.keynote", extensions: %w( key ), parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.pages", parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.numbers", parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.keynote", parents: "application/zip"

Marcel::MimeType.extend "audio/aac", extensions: %w( aac ), parents: "audio/x-aac"
Marcel::MimeType.extend("audio/ogg", extensions: %w( ogg oga ), magic: [[0, 'OggS', [[29, 'vorbis']]]])
# Upstream aliases to application/x-x509-cert. Override with a ;format=pem subtype.
Marcel::MimeType.extend "application/x-x509-ca-cert", magic: [[0, '-----BEGIN CERTIFICATE-----']], extensions: %w( pem ), parents: "application/x-x509-cert;format=pem"

Marcel::MimeType.extend "image/vnd.dwg", magic: [[0, "AC10"]]
Marcel::MimeType.extend "audio/mpc", magic: [[0, "MPCKSH"]], extensions: %w( mpc )
Marcel::MimeType.extend "audio/ogg", extensions: %w( ogg oga ), magic: [[0, 'OggS', [[29, 'vorbis']]]]
Marcel::MimeType.canonicalize "audio/aac", instead_of: "audio/x-aac"
Marcel::MimeType.canonicalize "audio/flac", instead_of: "audio/x-flac"
Marcel::MimeType.canonicalize "audio/x-wav", instead_of: "audio/vnd.wave"

Marcel::MimeType.extend "application/x-x509-ca-cert", magic: [[0, '-----BEGIN CERTIFICATE-----']], extensions: %w( pem ), parents: "application/x-x509-cert;format=pem"
Marcel::MimeType.extend "image/vnd.dwg", magic: [[0, "AC10"]]

Marcel::MimeType.extend "image/avif", magic: [[4, "ftypavif"]], extensions: %w( avif )
Marcel::MimeType.extend "image/heif", magic: [[4, "ftypmif1"]], extensions: %w( heif )
Marcel::MimeType.extend "image/heic", magic: [[4, "ftypheic"]], extensions: %w( heic )
Marcel::MimeType.extend "image/avif", magic: [[4, "ftypavif"]]
Marcel::MimeType.extend "image/heif", magic: [[4, "ftypmif1"]]
Marcel::MimeType.extend "image/heic", magic: [[4, "ftypheic"]]

Marcel::MimeType.extend "image/x-raw-sony", extensions: %w( arw ), parents: "image/tiff"
Marcel::MimeType.extend "image/x-raw-canon", extensions: %w( cr2 crw ), parents: "image/tiff"
Marcel::MimeType.extend "image/x-raw-canon", parents: "image/tiff"

Marcel::MimeType.extend "video/mp4", magic: [[4, "ftypisom"], [4, "ftypM4V "]], extensions: %w( mp4 m4v )

Marcel::MimeType.extend "audio/flac", magic: [[0, 'fLaC']], extensions: %w( flac ), parents: "audio/x-flac"
Marcel::MimeType.extend "audio/x-wav", magic: [[0, 'RIFF', [[8, 'WAVE']]]], extensions: %w( wav ), parents: "audio/vnd.wav"
Marcel::MimeType.extend "audio/mpc", magic: [[0, "MPCKSH"]], extensions: %w( mpc )

Marcel::MimeType.extend "font/ttf", magic: [[0, "\x00\x01\x00\x00"]], extensions: %w( ttf ttc )
Marcel::MimeType.extend "font/otf", magic: [[0, "OTTO"]], extensions: %w( otf ), parents: "font/ttf"
Marcel::MimeType.extend "application/vnd.adobe.flash.movie", magic: [[0, "FWS"], [0, "CWS"]], extensions: %w( swf )
Expand Down
143 changes: 143 additions & 0 deletions lib/marcel/tables.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2148,6 +2148,149 @@ module Marcel
'video/x-sgi-movie' => %w(movie),
'x-conference/x-cooltalk' => %w(ice), # Cooltalk Audio
}
# Alias MIME type => canonical MIME type.
#
# Keys must be lowercase: type lookups downcase the requested type before
# consulting this table, so a mixed-case key can never match.
# NOTE(review): this table is emitted by script/generate_tables.rb; the
# generator should downcase alias names too, or a regeneration will bring
# mixed-case keys back.
TYPE_ALIASES = {
  'application/bat' => 'application/x-bat',
  'application/x-coreldraw' => 'application/coreldraw',
  'application/x-cdr' => 'application/coreldraw',
  'application/cdr' => 'application/coreldraw',
  'image/x-cdr' => 'application/coreldraw',
  'image/cdr' => 'application/coreldraw',
  'application/x-setupscript' => 'application/inf',
  'application/x-wine-extension-inf' => 'application/inf',
  'application/x-javascript' => 'application/javascript',
  'text/javascript' => 'application/javascript',
  'application/x-java-vm' => 'application/java-vm',
  'application/x-java' => 'application/java-vm',
  'application/mac-binhex' => 'application/mac-binhex40',
  'application/binhex' => 'application/mac-binhex40',
  'application/vnd.ms-word' => 'application/msword',
  'application/x-ogg' => 'audio/vorbis',
  'application/msonenote' => 'application/onenote',
  'application/x-pdf' => 'application/pdf',
  'application/pgp' => 'application/pgp-encrypted',
  'text/rss' => 'application/rss+xml',
  'text/rtf' => 'application/rtf',
  'application/smil' => 'application/smil+xml',
  'application/x-kchart' => 'application/vnd.kde.kchart',
  'application/x-kpresenter' => 'application/vnd.kde.kpresenter',
  'application/x-kspread' => 'application/vnd.kde.kspread',
  'application/x-kword' => 'application/vnd.kde.kword',
  'application/x-koan' => 'application/vnd.koan',
  'application/x-123' => 'application/vnd.lotus-1-2-3',
  'application/x-mif' => 'application/vnd.mif',
  'application/x-frame' => 'application/vnd.mif',
  'application/msexcel' => 'application/vnd.ms-excel',
  'application/mspowerpoint' => 'application/vnd.ms-powerpoint',
  'application/ms-tnef' => 'application/vnd.ms-tnef',
  'application/oxps' => 'application/vnd.ms-xpsdocument',
  'application/x-vnd.oasis.opendocument.chart' => 'application/vnd.oasis.opendocument.chart',
  'application/x-vnd.oasis.opendocument.chart-template' => 'application/vnd.oasis.opendocument.chart-template',
  'application/vnd.oasis.opendocument.database' => 'application/vnd.oasis.opendocument.base',
  'application/x-vnd.oasis.opendocument.formula' => 'application/vnd.oasis.opendocument.formula',
  'application/x-vnd.oasis.opendocument.formula-template' => 'application/vnd.oasis.opendocument.formula-template',
  'application/x-vnd.oasis.opendocument.graphics' => 'application/vnd.oasis.opendocument.graphics',
  'application/x-vnd.oasis.opendocument.graphics-template' => 'application/vnd.oasis.opendocument.graphics-template',
  'application/x-vnd.oasis.opendocument.image' => 'application/vnd.oasis.opendocument.image',
  'application/x-vnd.oasis.opendocument.image-template' => 'application/vnd.oasis.opendocument.image-template',
  'application/x-vnd.oasis.opendocument.presentation' => 'application/vnd.oasis.opendocument.presentation',
  'application/x-vnd.oasis.opendocument.presentation-template' => 'application/vnd.oasis.opendocument.presentation-template',
  'application/x-vnd.oasis.opendocument.spreadsheet' => 'application/vnd.oasis.opendocument.spreadsheet',
  'application/x-vnd.oasis.opendocument.spreadsheet-template' => 'application/vnd.oasis.opendocument.spreadsheet-template',
  'application/x-vnd.oasis.opendocument.text' => 'application/vnd.oasis.opendocument.text',
  'application/x-vnd.oasis.opendocument.text-master' => 'application/vnd.oasis.opendocument.text-master',
  'application/x-vnd.oasis.opendocument.text-template' => 'application/vnd.oasis.opendocument.text-template',
  'application/x-vnd.oasis.opendocument.text-web' => 'application/vnd.oasis.opendocument.text-web',
  'application/x-vnd.sun.xml.writer' => 'application/vnd.sun.xml.writer',
  'application/vnd.ms-visio' => 'application/vnd.visio',
  'image/x-targa' => 'image/x-tga',
  'application/x-unix-archive' => 'application/x-archive',
  'application/x-arj-compressed' => 'application/x-arj',
  'application/x-dbm' => 'application/x-berkeley-db',
  'application/vnd.debian.binary-package' => 'application/x-debian-package',
  'application/x-gnumeric-spreadsheet' => 'application/x-gnumeric',
  'application/x-gzip' => 'application/gzip',
  'application/x-gunzip' => 'application/gzip',
  'application/gzipped' => 'application/gzip',
  'application/gzip-compressed' => 'application/gzip',
  'application/x-gzip-compressed' => 'application/gzip',
  'gzip/document' => 'application/gzip',
  'application/x-windows-installer' => 'application/x-ms-installer',
  'application/x-msi' => 'application/x-ms-installer',
  'application/x-rar' => 'application/x-rar-compressed',
  'text/x-tex' => 'application/x-tex',
  'text/x-texinfo' => 'application/x-texinfo',
  'application/x-x509-ca-cert' => 'application/x-x509-cert',
  'application/x-x509-user-cert' => 'application/x-x509-cert',
  'text/xml' => 'application/xml',
  'application/x-xml' => 'application/xml',
  'text/x-dtd' => 'application/xml-dtd',
  'text/xml-external-parsed-entity' => 'application/xml-external-parsed-entity',
  'text/xsl' => 'application/xslt+xml',
  'application/x-zip-compressed' => 'application/zip',
  'application/x-deflate' => 'application/zlib',
  'audio/x-m4a' => 'audio/mp4',
  'audio/x-mp4a' => 'audio/mp4',
  'audio/x-mpeg' => 'audio/mpeg',
  'audio/x-ogg-flac' => 'audio/x-oggflac',
  'audio/x-ogg-pcm' => 'audio/x-oggpcm',
  'application/x-speex' => 'audio/speex',
  'audio/aiff' => 'audio/x-aiff',
  'audio/x-realaudio' => 'audio/x-pn-realaudio',
  'audio/x-wav' => 'audio/vnd.wave',
  'audio/wave' => 'audio/vnd.wave',
  'audio/wav' => 'audio/vnd.wave',
  'image/x-bmp' => 'image/bmp',
  'image/x-ms-bmp' => 'image/bmp',
  'image/x-emf' => 'image/emf',
  'application/x-emf' => 'image/emf',
  'application/x-ms-emz' => 'image/x-emf-compressed',
  'image/hevc' => 'image/heic',
  'image/hevc-sequence' => 'image/heic-sequence',
  'video/jpm' => 'image/jpm',
  'image/ntf' => 'image/nitf',
  'image/x-psd' => 'image/vnd.adobe.photoshop',
  'application/photoshop' => 'image/vnd.adobe.photoshop',
  'image/x-dwg' => 'image/vnd.dwg',
  'application/acad' => 'image/vnd.dwg',
  'application/x-acad' => 'image/vnd.dwg',
  'application/autocad_dwg' => 'image/vnd.dwg',
  'application/dwg' => 'image/vnd.dwg',
  'application/x-dwg' => 'image/vnd.dwg',
  'application/x-autocad' => 'image/vnd.dwg',
  'drawing/dwg' => 'image/vnd.dwg',
  'image/x-icon' => 'image/vnd.microsoft.icon',
  'image/x-dcx' => 'image/vnd.zbrush.dcx',
  'image/x-pcx' => 'image/vnd.zbrush.pcx',
  'image/x-pc-paintbrush' => 'image/vnd.zbrush.pcx',
  'image/x-wmf' => 'image/wmf',
  'application/x-msmetafile' => 'image/wmf',
  'image/x-jb2' => 'image/x-jbig2',
  'image/xcf' => 'image/x-xcf',
  'application/x-mimearchive' => 'multipart/related',
  'message/rfc2557' => 'multipart/related',
  'drawing/x-dwf' => 'model/vnd.dwf',
  'text/x-asm' => 'text/x-assembly',
  'application/x-troff' => 'text/troff',
  'application/x-troff-man' => 'text/troff',
  'application/x-troff-me' => 'text/troff',
  'application/x-troff-ms' => 'text/troff',
  'text/x-c' => 'text/x-csrc',
  'text/x-java' => 'text/x-java-source',
  'text/x-properties' => 'text/x-java-properties',
  'text/properties' => 'text/x-java-properties',
  'application/x-httpd-jsp' => 'text/x-jsp',
  'application/matlab-mat' => 'application/x-matlab-data',
  'application/x-tcl' => 'text/x-tcl',
  'video/x-daala' => 'video/daala',
  'video/x-theora' => 'video/theora',
  'video/x-ogg-uvs' => 'video/x-ogguvs',
  'video/x-ogg-yuv' => 'video/x-oggyuv',
  'video/x-ogg-rgb' => 'video/x-oggrgb',
  'video/avi' => 'video/x-msvideo',
  'video/msvideo' => 'video/x-msvideo',
  'application/font-woff' => 'font/woff',
  'application/font-woff2' => 'font/woff2',
}
TYPE_PARENTS = {
'application/bizagi-modeler' => %w(application/zip),
'application/dash+xml' => %w(application/xml),
Expand Down
7 changes: 7 additions & 0 deletions script/generate_tables.rb
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def get_matches(mime, parent)

extensions = {}
types = {}
aliases = {}
magics = []

ARGV.each do |path|
Expand All @@ -137,6 +138,7 @@ def get_matches(mime, parent)
(doc/'mime-info/mime-type').each do |mime|
comments = Hash[*(mime/'_comment').map {|comment| [comment['xml:lang'], comment.inner_text] }.flatten]
type = mime['type']
(mime/'alias').each { |x| aliases[x['type']] = type }
subclass = (mime/'sub-class-of').map{|x| x['type']}
exts = (mime/'glob').map{|x| x['pattern'] =~ /^\*\.([^\[\]]+)$/ ? $1.downcase : nil }.compact
(mime/'magic').each do |magic|
Expand Down Expand Up @@ -222,6 +224,11 @@ def get_matches(mime, parent)
puts " '#{key}' => %w(#{exts}),#{comment}"
end
puts " }"
puts " TYPE_ALIASES = {"
aliases.each do |aliased, type|
puts " '#{aliased}' => '#{type}',"
end
puts " }"
puts " TYPE_PARENTS = {"
types.keys.sort.each do |key|
parents = types[key][1].sort.join(' ')
Expand Down
5 changes: 5 additions & 0 deletions test/declared_type_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,9 @@ class Marcel::MimeType::DeclaredTypeTest < Marcel::TestCase
# A declared type that carries a charset parameter is reported as the bare
# media type.
test "ignores charset declarations" do
assert_equal "text/html", Marcel::MimeType.for(declared_type: "text/html; charset=utf-8")
end

# Takes the first alias => canonical pair from Marcel::TYPE_ALIASES and checks
# that declaring the alias is reported as the canonical type.
test "resolves declared type to a canonical MIME type" do
aliased, canonical = Marcel::TYPE_ALIASES.first
assert_equal canonical, Marcel::MimeType.for(declared_type: aliased)
end
end
Loading

0 comments on commit 7a173b1

Please sign in to comment.