# Copyright Red Hat
#
# snapm/fsdiff/filetypes.py - Snapshot Manager fs diff file types
#
# This file is part of the snapm project.
#
# SPDX-License-Identifier: Apache-2.0
"""
File type information support.
"""
from typing import ClassVar, Dict, Optional, Tuple
from fnmatch import fnmatch
from pathlib import Path
from enum import Enum
import logging
try:
import magic
_HAVE_MAGIC = True
except ModuleNotFoundError:
_HAVE_MAGIC = False
from snapm import SNAPM_SUBSYSTEM_FSDIFF, SnapmNotFoundError
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
# Short aliases bound to the logger's level-specific methods.
_log_debug = _log.debug
_log_info = _log.info
_log_warn = _log.warning
_log_error = _log.error
[docs]
def _log_debug_fsdiff(msg, *args, **kwargs):
    """
    Emit a debug log record tagged with the fsdiff subsystem.

    :param msg: The log message (format string).
    :param args: Positional arguments passed through to the logger.
    :param kwargs: Keyword arguments passed through to the logger.
    """
    # The "subsystem" key travels in the record's ``extra`` mapping.
    subsystem_tag = {"subsystem": SNAPM_SUBSYSTEM_FSDIFF}
    _log.debug(msg, *args, extra=subsystem_tag, **kwargs)
# Mappings for Text-based Extensions
# Format: ".ext": ("mime/type", "description starting with lowercase")
#
# Keys are lower-case and include the leading dot: _generic_guess_file()
# looks them up using the lower-cased ``Path.suffix`` of the file, so only
# the final extension of a name is ever considered.
TEXT_EXTENSION_MAP = {
    # General Text & Documentation
    ".txt": ("text/plain", "plain text document"),
    ".text": ("text/plain", "plain text document"),
    ".md": ("text/markdown", "markdown documentation"),
    ".markdown": ("text/markdown", "markdown documentation"),
    ".rst": ("text/x-rst", "reStructuredText document"),
    ".adoc": ("text/asciidoc", "asciidoc document"),
    ".asciidoc": ("text/asciidoc", "asciidoc document"),
    ".nfo": ("text/x-nfo", "nfo information file"),
    ".tex": ("text/x-tex", "latex source document"),
    ".bib": ("text/x-bibtex", "bibtex bibliography"),
    ".lyx": ("application/x-lyx", "lyx document"),
    ".rtf": ("text/rtf", "rich text format document"),
    # Numeric suffixes: manual page sections 1-9.
    ".1": ("text/troff", "troff or preprocessor input"),
    ".2": ("text/troff", "troff or preprocessor input"),
    ".3": ("text/troff", "troff or preprocessor input"),
    ".4": ("text/troff", "troff or preprocessor input"),
    ".5": ("text/troff", "troff or preprocessor input"),
    ".6": ("text/troff", "troff or preprocessor input"),
    ".7": ("text/troff", "troff or preprocessor input"),
    ".8": ("text/troff", "troff or preprocessor input"),
    ".9": ("text/troff", "troff or preprocessor input"),
    # Data & Configuration
    ".json": ("application/json", "json data file"),
    ".json5": ("application/json5", "json5 data file"),
    ".jsonl": ("application/x-jsonlines", "json lines data file"),
    ".ndjson": ("application/x-ndjson", "newline delimited json file"),
    ".xml": ("application/xml", "xml document"),
    ".yaml": ("application/yaml", "yaml configuration file"),
    ".yml": ("application/yaml", "yaml configuration file"),
    ".toml": ("application/toml", "toml configuration file"),
    ".ini": ("text/x-ini", "ini configuration file"),
    ".cfg": ("text/x-config", "configuration file"),
    ".conf": ("text/x-config", "configuration file"),
    ".properties": ("text/x-java-properties", "java properties file"),
    ".env": ("text/x-env", "environment variable file"),
    ".csv": ("text/csv", "comma-separated values"),
    ".tsv": ("text/tab-separated-values", "tab-separated values"),
    ".log": ("text/x-log", "log file"),
    ".dat": ("text/x-fixed-field", "data file"),  # Ambiguous, defaults to text here
    ".reg": ("text/x-windows-registry", "windows registry file"),
    # systemd unit files (same extensions as SYSTEMD_UNIT_EXTENSIONS)
    ".service": ("text/plain", "systemd service unit file"),
    ".socket": ("text/plain", "systemd socket unit file"),
    ".device": ("text/plain", "systemd device unit file"),
    ".mount": ("text/plain", "systemd mount unit file"),
    ".automount": ("text/plain", "systemd automount unit file"),
    ".swap": ("text/plain", "systemd swap unit file"),
    ".target": ("text/plain", "systemd target unit file"),
    ".path": ("text/plain", "systemd path unit file"),
    ".timer": ("text/plain", "systemd timer unit file"),
    ".slice": ("text/plain", "systemd slice unit file"),
    ".scope": ("text/plain", "systemd scope unit file"),
    # Web Standards
    ".html": ("text/html", "html document"),
    ".htm": ("text/html", "html document"),
    ".xhtml": ("application/xhtml+xml", "xhtml document"),
    ".css": ("text/css", "cascading style sheet"),
    ".scss": ("text/x-scss", "sass style sheet"),
    ".sass": ("text/x-sass", "sass style sheet"),
    ".less": ("text/x-less", "less style sheet"),
    ".styl": ("text/x-stylus", "stylus style sheet"),
    ".svg": ("image/svg+xml", "scalable vector graphics"),
    ".rss": ("application/rss+xml", "rss feed"),
    ".atom": ("application/atom+xml", "atom syndication feed"),
    ".js": ("text/javascript", "javascript source code"),
    ".jsx": ("text/jsx", "react jsx source code"),
    ".ts": ("application/typescript", "typescript source code"),
    ".tsx": ("application/typescript", "typescript jsx source code"),
    ".mjs": ("text/javascript", "modular javascript source code"),
    ".cjs": ("text/javascript", "commonjs source code"),
    # Strictly speaking, it's ".wasm.wat" but currently we only consider the
    # final extension.
    ".wat": ("text/wasm", "webassembly text format"),
    ".coffee": ("text/coffee", "coffeescript source code"),
    # Scripting & Shell
    ".sh": ("application/x-sh", "shell script"),
    ".bash": ("application/x-sh", "bash script"),
    ".zsh": ("application/x-zsh", "zsh script"),
    ".fish": ("application/x-fish", "fish script"),
    ".ksh": ("application/x-ksh", "kornshell script"),
    ".csh": ("application/x-csh", "c shell script"),
    ".bat": ("application/x-bat", "dos batch file"),
    ".cmd": ("application/x-bat", "windows command script"),
    ".ps1": ("text/x-powershell", "powershell script"),
    ".psm1": ("text/x-powershell", "powershell module"),
    ".psd1": ("text/x-powershell", "powershell data file"),
    ".vbs": ("text/vbs", "vbscript file"),
    ".lua": ("text/x-lua", "lua script"),
    ".pl": ("text/x-perl", "perl script"),
    ".pm": ("text/x-perl", "perl module"),
    ".t": ("text/x-perl", "perl test file"),
    ".tcl": ("text/x-tcl", "tcl script"),
    ".awk": ("text/x-awk", "awk script"),
    ".sed": ("text/x-sed", "sed script"),
    # Source Code
    ".py": ("text/x-python", "python source code"),
    ".pyw": ("text/x-python", "python gui source code"),
    ".pyi": ("text/x-python", "python interface file"),
    ".rb": ("text/x-ruby", "ruby source code"),
    ".erb": ("application/x-erb", "embedded ruby template"),
    ".rake": ("text/x-ruby", "ruby rake file"),
    ".gemspec": ("text/x-ruby", "ruby gem specification"),
    ".java": ("text/x-java-source", "java source code"),
    ".kt": ("text/x-kotlin", "kotlin source code"),
    ".kts": ("text/x-kotlin", "kotlin script"),
    ".groovy": ("text/x-groovy", "groovy source code"),
    ".scala": ("text/x-scala", "scala source code"),
    ".clj": ("text/x-clojure", "clojure source code"),
    ".c": ("text/x-c", "c source code"),
    ".h": ("text/x-c", "c header file"),
    ".cpp": ("text/x-c++", "c++ source code"),
    ".hpp": ("text/x-c++", "c++ header file"),
    ".cc": ("text/x-c++", "c++ source code"),
    ".cxx": ("text/x-c++", "c++ source code"),
    ".m": ("text/x-objective-c", "objective-c source code"),
    ".mm": ("text/x-objective-c++", "objective-c++ source code"),
    ".cs": ("text/x-csharp", "c# source code"),
    ".vb": ("text/x-vb", "visual basic source code"),
    ".fs": ("text/x-fsharp", "f# source code"),
    ".fsx": ("text/x-fsharp", "f# script"),
    ".go": ("text/x-go", "go source code"),
    ".rs": ("text/rust", "rust source code"),
    ".swift": ("text/x-swift", "swift source code"),
    ".dart": ("application/vnd.dart", "dart source code"),
    ".d": ("text/x-d", "d source code"),
    ".php": ("application/x-php", "php source code"),
    ".phtml": ("application/x-php", "php template"),
    ".r": ("text/x-r", "r source code"),
    ".rmd": ("text/x-r", "r markdown file"),
    ".jl": ("text/x-julia", "julia source code"),
    ".sql": ("application/x-sql", "sql database script"),
    ".pgsql": ("application/x-sql", "postgresql script"),
    ".psql": ("application/x-sql", "postgresql script"),
    ".f": ("text/x-fortran", "fortran source code"),
    ".for": ("text/x-fortran", "fortran source code"),
    ".f90": ("text/x-fortran", "fortran 90 source code"),
    ".f95": ("text/x-fortran", "fortran 95 source code"),
    ".asm": ("text/x-asm", "assembly source code"),
    ".s": ("text/x-asm", "assembly source code"),
    ".nasm": ("text/x-nasm", "nasm assembly source code"),
    ".elm": ("text/x-elm", "elm source code"),
    ".erl": ("text/x-erlang", "erlang source code"),
    ".hrl": ("text/x-erlang", "erlang header file"),
    ".ex": ("text/x-elixir", "elixir source code"),
    ".exs": ("text/x-elixir", "elixir script"),
    ".hs": ("text/x-haskell", "haskell source code"),
    ".lhs": ("text/x-literate-haskell", "literate haskell source code"),
    ".ml": ("text/x-ocaml", "ocaml source code"),
    ".mli": ("text/x-ocaml", "ocaml interface file"),
    ".lisp": ("text/x-lisp", "lisp source code"),
    ".lsp": ("text/x-lisp", "lisp source code"),
    ".scm": ("text/x-scheme", "scheme source code"),
    ".ada": ("text/x-ada", "ada source code"),
    ".adb": ("text/x-ada", "ada body file"),
    ".ads": ("text/x-ada", "ada specification file"),
    ".pas": ("text/x-pascal", "pascal source code"),
    ".pp": ("text/x-pascal", "pascal source code"),
    ".vhdl": ("text/x-vhdl", "vhdl source code"),
    ".vhd": ("text/x-vhdl", "vhdl source code"),
    ".v": ("text/x-verilog", "verilog source code"),
    ".sv": ("text/x-systemverilog", "systemverilog source code"),
    # Components & Templates
    ".vue": ("text/x-vue", "vue.js component"),
    ".svelte": ("text/x-svelte", "svelte component"),
    ".astro": ("text/x-astro", "astro component"),
    ".ejs": ("text/x-ejs", "embedded javascript template"),
    ".hbs": ("text/x-handlebars", "handlebars template"),
    ".mustache": ("text/x-mustache", "mustache template"),
    ".twig": ("text/x-twig", "twig template"),
    ".jinja": ("text/jinja", "jinja template"),
    ".jinja2": ("text/jinja", "jinja2 template"),
    ".liquid": ("text/x-liquid", "liquid template"),
    ".jsp": ("application/x-jsp", "java server page"),
    ".asp": ("text/asp", "active server page"),
    ".aspx": ("text/asp", "active server page extended"),
    ".razor": ("text/x-razor", "razor view"),
    ".haml": ("text/x-haml", "haml template"),
    ".jade": ("text/x-jade", "jade template"),
    ".pug": ("text/x-pug", "pug template"),
    # Build & Version Control
    ".cmake": ("text/x-cmake", "cmake script"),
    ".makefile": ("text/x-makefile", "makefile script"),
    ".mk": ("text/x-makefile", "makefile script"),
    ".gradle": ("text/x-gradle", "gradle build script"),
    ".pom": ("text/xml", "maven project object model"),
    ".bazel": ("text/x-bazel", "bazel build script"),
    ".dockerfile": ("text/x-dockerfile", "docker build script"),
    ".containerfile": ("text/x-dockerfile", "container build script"),
    ".vagrantfile": ("text/x-ruby", "vagrant configuration file"),
    ".diff": ("text/x-diff", "patch diff file"),
    ".patch": ("text/x-diff", "patch file"),
    # NOTE(review): pathlib reports an empty suffix for a bare dotfile
    # (``Path(".gitignore").suffix == ""``), so through the suffix lookup
    # these three keys only apply to names such as "global.gitignore".
    ".gitignore": ("text/plain", "git ignore file"),
    ".gitattributes": ("text/plain", "git attributes file"),
    ".gitmodules": ("text/plain", "git modules file"),
    ".lock": ("text/plain", "lock file"),
    # Parsers
    ".y": ("text/x-yacc", "yacc grammar file"),
    ".yacc": ("text/x-yacc", "yacc grammar file"),
    ".yy": ("text/x-yacc", "bison grammar file"),
    ".l": ("text/x-lex", "lex file"),
    ".lex": ("text/x-lex", "lex file"),
    ".ll": ("text/x-lex", "flex file"),
    ".m4": ("text/x-m4", "m4 macro file"),
    ".proto": ("text/x-protobuf", "protocol buffers file"),
    ".thrift": ("application/x-thrift", "thrift definition file"),
    ".g4": ("text/x-antlr", "antlr4 grammar file"),
    # Misc
    ".eps": ("application/postscript", "encapsulated postscript"),
    ".ps": ("application/postscript", "postscript file"),
    ".pem": ("application/x-pem-file", "privacy enhanced mail certificate"),
    ".csr": ("application/pkcs10", "certificate signing request"),
    ".key": ("application/pkcs8", "private key file"),
    ".ics": ("text/calendar", "icalendar file"),
    ".vcf": ("text/vcard", "vcard file"),
    ".srt": ("text/srt", "subrip subtitle file"),
    ".vtt": ("text/vtt", "web video text track"),
    ".sub": ("text/x-microdvd", "microdvd subtitle file"),
}
# Mappings for Text-based Filenames (Extensionless)
# Format: "pattern": ("mime/type", "description starting with lowercase")
#
# Keys are lower-case glob patterns in ``pathlib`` match syntax; the file
# name is lower-cased before it is tested against them. Entries are tried
# in dictionary order and the first matching pattern wins, so more specific
# patterns must not be shadowed by an earlier, looser one.
TEXT_FILENAME_MAP = {
    "*makefile": ("text/x-makefile", "makefile build script"),
    "*dockerfile": ("text/x-dockerfile", "docker build script"),
    "*containerfile": ("text/x-dockerfile", "container build script"),
    "*rakefile": ("text/x-ruby", "ruby rake build script"),
    "*gemfile": ("text/x-ruby", "ruby gem dependency file"),
    "*vagrantfile": ("text/x-ruby", "vagrant configuration file"),
    "*procfile": ("text/plain", "process declaration file"),
    "*license": ("text/plain", "license text"),
    "*readme": ("text/plain", "readme text"),
    "*changelog": ("text/plain", "changelog text"),
    "*copying": ("text/plain", "copyright text"),
    "*os-release": ("text/plain", "OS release data"),
    "*system-release-cpe": ("text/plain", "common platform enumerator OS release data"),
    "*system-release": ("text/plain", "OS release name"),
    "*fedora-release": ("text/plain", "OS release name"),
    "*centos-release": ("text/plain", "OS release name"),
    "*redhat-release": ("text/plain", "OS release name"),
    "*issue": ("text/plain", "login banner message"),
    "*issue.net": ("text/plain", "login banner message"),
    "*motd": ("text/plain", "message of the day"),
    "*fstab": ("text/plain", "static file system information"),
    # Git dotfiles: pathlib reports an empty suffix for names such as
    # ".gitignore", so an extension lookup can never identify them. Match
    # them here by their exact (lower-cased) file name instead.
    ".gitignore": ("text/plain", "git ignore file"),
    ".gitattributes": ("text/plain", "git attributes file"),
    ".gitmodules": ("text/plain", "git modules file"),
}
# Directories typically containing text files.
# Format: "parent_path": ("mime/type", "description starting with lowercase")
#
# _guess_text_file() tests each key with ``str.endswith`` against every
# absolute parent directory of the file, nearest parent first, so a key also
# matches when it is the tail of a longer path (for example a mount point
# followed by "/etc").
TEXT_FILE_PATHS = {
    "/etc": ("text/plain", "configuration file"),
    "/usr/share/doc": ("text/plain", "documentation"),
    "/usr/local/etc": ("text/plain", "local configuration"),
    "/usr/local/share/doc": ("text/plain", "local documentation"),
    "/var/log": ("text/plain", "log file"),
}
# Tuple of systemd unit file extensions for special handling.
# Each entry is lower-case and includes the leading dot, matching the
# systemd keys used in TEXT_EXTENSION_MAP.
SYSTEMD_UNIT_EXTENSIONS = (
    ".service",
    ".socket",
    ".device",
    ".mount",
    ".automount",
    ".swap",
    ".target",
    ".path",
    ".timer",
    ".slice",
    ".scope",
)
# Mappings for Binary Extensions
# Format: ".ext": ("mime/type", "description starting with lowercase")
#
# Keys are lower-case and include the leading dot: _generic_guess_file()
# looks them up using the lower-cased ``Path.suffix`` of the file, so only
# the final extension of a name is ever considered.
BINARY_EXTENSION_MAP = {
    # Executables & Libraries
    ".exe": (
        "application/vnd.microsoft.portable-executable",
        "windows executable file",
    ),
    ".bin": ("application/octet-stream", "binary data file"),
    ".elf": ("application/x-elf", "elf executable"),
    ".o": ("application/x-object", "object file"),
    ".so": ("application/x-sharedlib", "shared library"),
    ".dll": ("application/x-msdownload", "dynamic link library"),
    ".class": ("application/java-vm", "java class file"),
    ".pyc": ("application/x-python-code", "compiled python bytecode"),
    ".pyo": ("application/x-python-code", "optimized python bytecode"),
    ".pyd": ("application/x-python-code", "python extension module"),
    ".jar": ("application/java-archive", "java archive"),
    ".war": ("application/java-archive", "web application archive"),
    ".ear": ("application/java-archive", "enterprise application archive"),
    ".msi": ("application/x-msi", "windows installer package"),
    ".deb": ("application/vnd.debian.binary-package", "debian software package"),
    ".rpm": ("application/x-rpm", "red hat package manager file"),
    ".app": ("application/x-apple-diskimage", "macos application bundle"),
    ".dmg": ("application/x-apple-diskimage", "apple disk image"),
    ".pkg": ("application/x-newton-compatible-pkg", "macos package file"),
    # Archives & Compression
    ".zip": ("application/zip", "zip archive"),
    ".tar": ("application/x-tar", "tar archive"),
    ".gz": ("application/gzip", "gzip compressed file"),
    ".bz2": ("application/x-bzip2", "bzip2 compressed file"),
    ".xz": ("application/x-xz", "xz compressed file"),
    ".7z": ("application/x-7z-compressed", "7-zip archive"),
    ".rar": ("application/x-rar-compressed", "rar archive"),
    ".z": ("application/x-compress", "unix compressed file"),
    ".lz": ("application/x-lzip", "lzip compressed file"),
    ".tgz": ("application/gzip", "tarball archive"),
    ".tbz2": ("application/x-bzip2", "bzip2 tarball archive"),
    ".iso": ("application/x-iso9660-image", "disk image file"),
    ".cab": ("application/vnd.ms-cab-compressed", "windows cabinet file"),
    ".arj": ("application/x-arj", "arj archive"),
    ".lzh": ("application/x-lzh", "lzh archive"),
    ".ace": ("application/x-ace-compressed", "ace archive"),
    ".uue": ("text/x-uuencode", "uuencoded file"),
    ".bz": ("application/x-bzip", "bzip compressed file"),
    ".lzma": ("application/x-lzma", "lzma compressed file"),
    ".zst": ("application/zstd", "zstandard compressed file"),
    # Media: Images
    ".jpg": ("image/jpeg", "jpeg image"),
    ".jpeg": ("image/jpeg", "jpeg image"),
    ".png": ("image/png", "portable network graphics image"),
    ".gif": ("image/gif", "gif image"),
    ".bmp": ("image/bmp", "bitmap image"),
    ".ico": ("image/x-icon", "icon file"),
    ".tif": ("image/tiff", "tiff image"),
    ".tiff": ("image/tiff", "tiff image"),
    ".webp": ("image/webp", "webp image"),
    ".raw": ("image/x-panasonic-raw", "raw image data"),
    ".heic": ("image/heic", "heic image"),
    ".psd": ("image/vnd.adobe.photoshop", "photoshop document"),
    ".ai": ("application/illustrator", "adobe illustrator file"),
    ".xcf": ("image/x-xcf", "gimp image file"),
    ".indd": ("application/x-indesign", "indesign document"),
    # Media: Audio
    ".mp3": ("audio/mpeg", "mp3 audio file"),
    ".wav": ("audio/wav", "wav audio file"),
    ".ogg": ("audio/ogg", "ogg vorbis audio file"),
    ".flac": ("audio/flac", "flac audio file"),
    ".aac": ("audio/aac", "aac audio file"),
    ".wma": ("audio/x-ms-wma", "windows media audio"),
    ".m4a": ("audio/mp4", "m4a audio file"),
    ".aiff": ("audio/x-aiff", "aiff audio file"),
    ".mid": ("audio/midi", "midi audio file"),
    ".midi": ("audio/midi", "midi audio file"),
    # Media: Video
    ".mp4": ("video/mp4", "mp4 video file"),
    ".avi": ("video/x-msvideo", "avi video file"),
    ".mkv": ("video/x-matroska", "matroska video file"),
    ".mov": ("video/quicktime", "quicktime video file"),
    ".wmv": ("video/x-ms-wmv", "windows media video"),
    ".flv": ("video/x-flv", "flash video file"),
    ".webm": ("video/webm", "webm video file"),
    ".m4v": ("video/x-m4v", "m4v video file"),
    ".mpg": ("video/mpeg", "mpeg video file"),
    ".mpeg": ("video/mpeg", "mpeg video file"),
    ".3gp": ("video/3gpp", "3gpp video file"),
    ".mts": ("video/mp2t", "hdv video file"),
    ".vob": ("video/mpeg", "dvd video object"),
    # Documents (Binary)
    ".pdf": ("application/pdf", "pdf document"),
    ".epub": ("application/epub+zip", "epub ebook"),
    ".mobi": ("application/x-mobipocket-ebook", "mobipocket ebook"),
    ".azw": ("application/vnd.amazon.ebook", "kindle ebook"),
    ".djvu": ("image/vnd.djvu", "djvu document"),
    ".docx": (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "word document",
    ),
    ".doc": ("application/msword", "legacy word document"),
    ".xlsx": (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "excel spreadsheet",
    ),
    ".xls": ("application/vnd.ms-excel", "legacy excel spreadsheet"),
    ".pptx": (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "powerpoint presentation",
    ),
    ".ppt": ("application/vnd.ms-powerpoint", "legacy powerpoint presentation"),
    ".xlsm": (
        "application/vnd.ms-excel.sheet.macroenabled.12",
        "macro-enabled excel spreadsheet",
    ),
    ".docm": (
        "application/vnd.ms-word.document.macroenabled.12",
        "macro-enabled word document",
    ),
    ".odt": ("application/vnd.oasis.opendocument.text", "opendocument text"),
    ".ods": (
        "application/vnd.oasis.opendocument.spreadsheet",
        "opendocument spreadsheet",
    ),
    ".odp": (
        "application/vnd.oasis.opendocument.presentation",
        "opendocument presentation",
    ),
    ".odg": ("application/vnd.oasis.opendocument.graphics", "opendocument graphics"),
    # Database & Disk Images
    ".sqlite": ("application/vnd.sqlite3", "sqlite database"),
    ".sqlite3": ("application/vnd.sqlite3", "sqlite database"),
    ".db": ("application/octet-stream", "database file"),
    ".mdb": ("application/x-msaccess", "access database"),
    ".accdb": ("application/x-msaccess", "access database"),
    ".frm": ("application/octet-stream", "mysql table definition"),
    ".ibd": ("application/octet-stream", "mysql innodb tablespace"),
    ".dbf": ("application/x-dbf", "database file"),
    ".img": ("application/octet-stream", "disk image"),
    ".vmdk": ("application/x-vmware-vmdk", "vmware virtual disk"),
    ".vdi": ("application/x-virtualbox-vdi", "virtualbox virtual disk"),
    ".qcow2": ("application/x-qemu-disk", "qemu copy-on-write disk"),
    ".hdd": ("application/octet-stream", "virtual hard disk"),
    # Miscellaneous / System
    # NOTE(review): as suffix keys the next two entries look unreachable:
    # ``Path(".DS_Store").suffix`` is "" and ``Path("Thumbs.db").suffix``
    # is ".db", so the suffix lookup never produces either key for those
    # file names.
    ".ds_store": ("application/octet-stream", "macos folder metadata"),
    ".thumbs.db": ("application/octet-stream", "windows thumbnail cache"),
    ".lnk": ("application/x-ms-shortcut", "windows shortcut"),
    ".sys": ("application/octet-stream", "windows system file"),
    ".drv": ("application/octet-stream", "device driver"),
    ".cur": ("image/x-win-bitmap", "windows cursor"),
    ".ttf": ("font/ttf", "truetype font"),
    ".otf": ("font/otf", "opentype font"),
    ".woff": ("font/woff", "web open font format"),
    ".woff2": ("font/woff2", "web open font format 2"),
    ".eot": ("application/vnd.ms-fontobject", "embedded opentype font"),
}
# Mappings for Binary Filenames/Patterns
# Format: "pattern": ("mime/type", "description starting with lowercase")
#
# Keys are lower-case glob patterns in ``pathlib`` match syntax; the file
# name is lower-cased before it is tested against them. Entries are tried
# in dictionary order and the first matching pattern wins.
BINARY_FILENAME_MAP = {
    "*.so.*": ("application/x-sharedlib", "versioned shared library"),
    "*vmlinuz*": ("application/x-linux-kernel", "linux kernel executable"),
    "*initrd*": ("application/x-cpio", "initial ramdisk image"),
    "*initramfs*": ("application/x-cpio", "initial ramdisk image"),
    "*core.*": ("application/x-coredump", "system core dump"),
    "*swapfile": ("application/octet-stream", "system swap file"),
    "*.git/objects/*": ("application/x-git-object", "git internal object"),
    "*.git/index": ("application/x-git-index", "git index file"),
    # Desktop metadata files that an extension lookup cannot identify:
    # pathlib reports an empty suffix for ".DS_Store" and ".db" for
    # "Thumbs.db", so they are matched by exact (lower-cased) name here.
    ".ds_store": ("application/octet-stream", "macos folder metadata"),
    "thumbs.db": ("application/octet-stream", "windows thumbnail cache"),
}
# Mappings that are typically for binaries/libraries
# Format: "parent_path": ("mime/type", "description starting with lowercase")
#
# _guess_binary_file() tests each key with ``str.endswith`` against every
# absolute parent directory of the file, nearest parent first, and returns
# on the first hit in dictionary order. Because the test is a suffix match,
# a short key such as "/lib" also matches a parent like "/usr/lib".
BINARY_FILE_PATHS = {
    "/bin": ("application/x-executable", "executable"),
    "/sbin": ("application/x-executable", "executable"),
    "/lib": ("application/x-sharedlib", "shared library"),
    "/lib32": ("application/x-sharedlib", "shared library"),
    "/lib64": ("application/x-sharedlib", "shared library"),
    "/libx32": ("application/x-sharedlib", "shared library"),
    "/usr/bin": ("application/x-executable", "executable"),
    "/usr/sbin": ("application/x-executable", "executable"),
    "/usr/lib": ("application/x-sharedlib", "shared library"),
    "/usr/lib32": ("application/x-sharedlib", "shared library"),
    "/usr/lib64": ("application/x-sharedlib", "shared library"),
    "/usr/libexec": ("application/x-executable", "executable"),
    "/usr/local/bin": ("application/x-executable", "executable"),
    "/usr/local/sbin": ("application/x-executable", "executable"),
    "/usr/local/lib": ("application/x-sharedlib", "shared library"),
    "/usr/local/lib64": ("application/x-sharedlib", "shared library"),
}
# Patterns that indicate binary content in /var/log
# These are ``fnmatch`` globs; _is_binary_log() tests them against the whole
# path string, not just the file name.
BINARY_LOG_PATTERNS = [
    "*btmp*",
    "*wtmp*",
    "*lastlog",
    "*sa[0-9][0-9]",
    "*.journal",
]
[docs]
def _is_binary_log(file_path: Path) -> bool:
    """
    Report whether ``file_path`` looks like a log file that normally holds
    binary data, judged purely by matching its path string against the
    globs in ``BINARY_LOG_PATTERNS``.

    :param file_path: The path to test.
    :type file_path: ``Path``
    :returns: ``True`` if any binary log pattern matches ``file_path``,
              ``False`` otherwise.
    :rtype: ``bool``
    """
    path_str = str(file_path)
    return any(fnmatch(path_str, pattern) for pattern in BINARY_LOG_PATTERNS)
[docs]
def _generic_guess_file(
file_path: Path,
extension_map: Dict[str, Tuple[str, str]],
filename_map: Dict[str, Tuple[str, str]],
encoding: str,
) -> Optional[Tuple[str, str, str]]:
"""
Attempt to guess a file's MIME type and description based on the file
name and extension.
:param file_path: A ``Path`` instance containing the file path to check.
:type file_path: ``Path``
:param extension_map: A map of ".extension": (mime_type, description)
tuples to use.
:type extension_map: ``Dict[str, Tuple[str, str]]``
:param filename_map: A map of "filename": (mime_type, description)
tuples to use.
:returns: A 3-tuple containing (mime_type, description, encoding) if the
type could be guessed or ``None`` otherwise.
:rtype: ``Optional[Tuple[str, str, str]]``
"""
# Check exact filename match (case-insensitive) for extensionless files
for file_name_pattern in filename_map.keys():
if Path(file_path.name.lower()).match(file_name_pattern):
return (*filename_map[file_name_pattern], encoding)
extension = file_path.suffix
# Check extension match
extension = extension.lower()
if extension and extension in extension_map:
return (*extension_map[extension], encoding)
return None
[docs]
def _guess_text_file(file_path: Path) -> Optional[Tuple[str, str, str]]:
    """
    Try to classify ``file_path`` as a text file.

    The checks run in this order: the text file name and extension maps,
    then the binary log patterns (which yield a binary result), and finally
    the parent directories of the absolute path compared against
    ``TEXT_FILE_PATHS``.

    :param file_path: A ``Path`` instance containing the file path to check.
    :type file_path: ``Path``
    :returns: A 3-tuple containing (mime_type, description, encoding) if the
              type could be guessed or ``None`` otherwise.
    :rtype: ``Optional[Tuple[str, str, str]]``
    """
    by_name = _generic_guess_file(
        file_path, TEXT_EXTENSION_MAP, TEXT_FILENAME_MAP, "utf-8"
    )
    if by_name is not None:
        return by_name

    if _is_binary_log(file_path):
        return ("application/octet-stream", "binary log file", "binary")

    # Nearest parent first; a directory counts as text-like when its path
    # ends with one of the known text directory paths.
    by_location = (
        (*type_tuple, "utf-8")
        for parent in file_path.absolute().parents
        for dir_suffix, type_tuple in TEXT_FILE_PATHS.items()
        if str(parent).endswith(dir_suffix)
    )
    return next(by_location, None)
[docs]
def _guess_binary_file(file_path: Path) -> Optional[Tuple[str, str, str]]:
    """
    Try to classify ``file_path`` as a binary file.

    The checks run in this order: the binary file name and extension maps,
    then the text file guesser, and finally the parent directories of the
    absolute path compared against ``BINARY_FILE_PATHS``.

    :param file_path: A ``Path`` instance containing the file path to check.
    :type file_path: ``Path``
    :returns: A 3-tuple containing (mime_type, description, encoding) if the
              type could be guessed or ``None`` otherwise.
    :rtype: ``Optional[Tuple[str, str, str]]``
    """
    by_name = _generic_guess_file(
        file_path, BINARY_EXTENSION_MAP, BINARY_FILENAME_MAP, "binary"
    )
    if by_name is not None:
        return by_name

    # Known text-like names take precedence over a binary-heavy location.
    as_text = _guess_text_file(file_path)
    if as_text is not None:
        return as_text

    # Nearest parent first; a directory counts as binary-heavy when its
    # path ends with one of the known binary directory paths.
    by_location = (
        (*type_tuple, "binary")
        for parent in file_path.absolute().parents
        for dir_suffix, type_tuple in BINARY_FILE_PATHS.items()
        if str(parent).endswith(dir_suffix)
    )
    return next(by_location, None)
[docs]
def _guess_file(file_path: Path) -> Tuple[str, str, str]:
    """
    Guess a file's MIME type, description and encoding from its path alone.

    The binary guesser is consulted first, then the text guesser; if neither
    produces a result a generic "unknown" binary type is returned.

    :param file_path: A ``Path`` instance containing the file path to check.
    :type file_path: ``Path``
    :returns: A 3-tuple containing (mime_type, description, encoding).
    :rtype: ``Tuple[str, str, str]``
    """
    for guesser in (_guess_binary_file, _guess_text_file):
        result = guesser(file_path)
        if result is not None:
            return result
    return ("application/octet-stream", "unknown file type", "binary")
[docs]
class FileTypeCategory(Enum):
    """
    Enum for file type categories.

    Each member's value is the short string form of the category, as used
    by ``FileTypeInfo.__str__``.
    """

    # Content-based categories
    TEXT = "text"
    BINARY = "binary"
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"
    ARCHIVE = "archive"
    EXECUTABLE = "executable"
    CONFIG = "config"
    LOG = "log"
    BINARY_LOG = "binary_log"
    DATABASE = "database"
    DOCUMENT = "document"
    DIRECTORY = "directory"
    SOURCE_CODE = "source_code"
    CERTIFICATE = "certificate"
    # Non-regular file system objects
    SYMLINK = "symlink"
    BLOCK = "block"
    CHAR = "char"
    SOCK = "socket"
    # NOTE(review): the only upper-case value in this enum - confirm
    # whether that is intentional before relying on the string form.
    FIFO = "FIFO"
    # Fallback when no category could be determined
    UNKNOWN = "unknown"
[docs]
class FileTypeInfo:
    """
    Holds the MIME type, description, category and encoding determined for
    a file, together with a derived ``is_text_like`` flag.
    """

    # Non-"text/" MIME types of documents that are still treated as text.
    TEXT_DOCUMENTS = (
        "application/rtf",
        "application/x-lyx",
    )

    def __init__(
        self,
        mime_type: str,
        description: str,
        category: FileTypeCategory,
        encoding: Optional[str] = None,
    ):
        """
        Initialise a new ``FileTypeInfo`` object.

        ``is_text_like`` is set for the text, config, log and source code
        categories, and for documents whose MIME type starts with ``text/``
        or is listed in ``TEXT_DOCUMENTS``.

        :param mime_type: The detected MIME type.
        :type mime_type: ``str``
        :param description: Human readable description of the type.
        :type description: ``str``
        :param category: File type category.
        :type category: ``FileTypeCategory``
        :param encoding: Optional file encoding.
        :type encoding: ``Optional[str]``
        """
        self.mime_type = mime_type
        self.description = description
        self.category = category
        self.encoding = encoding

        always_text = (
            FileTypeCategory.TEXT,
            FileTypeCategory.CONFIG,
            FileTypeCategory.LOG,
            FileTypeCategory.SOURCE_CODE,
        )
        if category in always_text:
            text_like = True
        elif category == FileTypeCategory.DOCUMENT:
            # Documents are text-like only for textual MIME types.
            text_like = mime_type.startswith("text/") or (
                mime_type in self.TEXT_DOCUMENTS
            )
        else:
            text_like = False
        self.is_text_like = text_like

    def __str__(self):
        """
        Return a string representation of this ``FileTypeInfo`` object.

        :returns: A human readable string describing this instance; a
                  missing encoding is shown as ``unknown``.
        :rtype: ``str``
        """
        encoding = self.encoding or "unknown"
        return (
            f"MIME type: {self.mime_type}, "
            f"Category: {self.category.value}, "
            f"Encoding: {encoding}, "
            f"Description: {self.description}"
        )
[docs]
class FileTypeDetector:
"""
Detect file types using ``magic`` from python3-file-magic.
"""
# Custom rules for better categorization. Black likes to break this
# so turn off formatting for the category_rules dict.
#
# Maps lower-case MIME type strings to a FileTypeCategory. Keys that end
# in "/" are bare top-level type prefixes rather than full MIME types.
# NOTE(review): presumably the code that consults this table tries an
# exact match first and the prefix entries last - confirm in the lookup.
# fmt: off
category_rules: ClassVar[Dict[str, FileTypeCategory]] = {
    # --- Archives & Compression ---
    "application/zip": FileTypeCategory.ARCHIVE,
    "application/x-tar": FileTypeCategory.ARCHIVE,
    "application/gzip": FileTypeCategory.ARCHIVE,
    "application/x-gzip": FileTypeCategory.ARCHIVE,
    "application/x-bzip2": FileTypeCategory.ARCHIVE,
    "application/x-lzip": FileTypeCategory.ARCHIVE,
    "application/x-lzma": FileTypeCategory.ARCHIVE,
    "application/x-xz": FileTypeCategory.ARCHIVE,
    "application/zstd": FileTypeCategory.ARCHIVE,
    "application/x-7z-compressed": FileTypeCategory.ARCHIVE,
    "application/x-rar": FileTypeCategory.ARCHIVE,
    "application/x-rar-compressed": FileTypeCategory.ARCHIVE,
    "application/java-archive": FileTypeCategory.ARCHIVE,
    "application/x-iso9660-image": FileTypeCategory.ARCHIVE,
    "application/vnd.android.package-archive": FileTypeCategory.ARCHIVE,
    # --- Executables & Libraries ---
    "application/x-executable": FileTypeCategory.EXECUTABLE,
    "application/x-elf": FileTypeCategory.EXECUTABLE,
    "application/x-sharedlib": FileTypeCategory.EXECUTABLE,
    "application/x-pie-executable": FileTypeCategory.EXECUTABLE,
    "application/x-mach-binary": FileTypeCategory.EXECUTABLE,
    "application/x-dosexec": FileTypeCategory.EXECUTABLE,
    "application/vnd.microsoft.portable-executable": FileTypeCategory.EXECUTABLE,
    "application/x-msdownload": FileTypeCategory.EXECUTABLE,
    "application/x-object": FileTypeCategory.EXECUTABLE,
    # --- Documents (Office & PDF) ---
    "application/pdf": FileTypeCategory.DOCUMENT,
    "application/msword": FileTypeCategory.DOCUMENT,
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        FileTypeCategory.DOCUMENT,
    "application/vnd.ms-excel": FileTypeCategory.DOCUMENT,
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        FileTypeCategory.DOCUMENT,
    "application/vnd.ms-powerpoint": FileTypeCategory.DOCUMENT,
    "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        FileTypeCategory.DOCUMENT,
    "application/vnd.oasis.opendocument.text": FileTypeCategory.DOCUMENT,
    "application/vnd.oasis.opendocument.spreadsheet": FileTypeCategory.DOCUMENT,
    "application/rtf": FileTypeCategory.DOCUMENT,
    "text/rtf": FileTypeCategory.DOCUMENT,
    "text/markdown": FileTypeCategory.DOCUMENT,
    "text/asciidoc": FileTypeCategory.DOCUMENT,
    "text/x-nfo": FileTypeCategory.DOCUMENT,
    "text/x-tex": FileTypeCategory.DOCUMENT,
    "text/x-bibtex": FileTypeCategory.DOCUMENT,
    "text/troff": FileTypeCategory.DOCUMENT,
    # --- Configuration & Data Serialization ---
    "application/json": FileTypeCategory.CONFIG,
    "application/ld+json": FileTypeCategory.CONFIG,
    "application/xml": FileTypeCategory.CONFIG,
    "text/xml": FileTypeCategory.CONFIG,
    "application/yaml": FileTypeCategory.CONFIG,
    "text/yaml": FileTypeCategory.CONFIG,
    "application/x-yaml": FileTypeCategory.CONFIG,
    "text/x-toml": FileTypeCategory.CONFIG,
    "application/toml": FileTypeCategory.CONFIG,
    "text/x-ini": FileTypeCategory.CONFIG,  # Unofficial but common
    # --- Databases ---
    "application/vnd.sqlite3": FileTypeCategory.DATABASE,
    "application/x-sqlite3": FileTypeCategory.DATABASE,
    "application/x-dbf": FileTypeCategory.DATABASE,
    "application/mbox": FileTypeCategory.DATABASE,  # Email storage
    # --- Source Code / Web Standards ---
    "application/javascript": FileTypeCategory.SOURCE_CODE,
    "application/x-javascript": FileTypeCategory.SOURCE_CODE,
    "text/javascript": FileTypeCategory.SOURCE_CODE,
    "text/x-python": FileTypeCategory.SOURCE_CODE,
    "text/x-script.python": FileTypeCategory.SOURCE_CODE,
    "text/x-shellscript": FileTypeCategory.SOURCE_CODE,
    "application/x-sh": FileTypeCategory.SOURCE_CODE,
    "text/x-c": FileTypeCategory.SOURCE_CODE,
    "text/x-c++": FileTypeCategory.SOURCE_CODE,
    "text/x-java-source": FileTypeCategory.SOURCE_CODE,
    "text/html": FileTypeCategory.SOURCE_CODE,
    "text/css": FileTypeCategory.SOURCE_CODE,
    "text/x-diff": FileTypeCategory.SOURCE_CODE,
    "text/x-makefile": FileTypeCategory.SOURCE_CODE,
    "application/x-zsh": FileTypeCategory.SOURCE_CODE,
    "application/x-fish": FileTypeCategory.SOURCE_CODE,
    "application/x-ksh": FileTypeCategory.SOURCE_CODE,
    "application/x-csh": FileTypeCategory.SOURCE_CODE,
    "application/x-bat": FileTypeCategory.SOURCE_CODE,
    "text/x-powershell": FileTypeCategory.SOURCE_CODE,
    "text/vbs": FileTypeCategory.SOURCE_CODE,
    "text/x-lua": FileTypeCategory.SOURCE_CODE,
    "text/x-perl": FileTypeCategory.SOURCE_CODE,
    "text/x-tcl": FileTypeCategory.SOURCE_CODE,
    "text/x-awk": FileTypeCategory.SOURCE_CODE,
    "text/x-sed": FileTypeCategory.SOURCE_CODE,
    # --- Certificates & Keys ---
    "application/x-x509-ca-cert": FileTypeCategory.CERTIFICATE,
    "application/x-pem-file": FileTypeCategory.CERTIFICATE,
    "application/pkix-cert": FileTypeCategory.CERTIFICATE,
    # --- System Inodes ---
    "inode/directory": FileTypeCategory.DIRECTORY,
    "inode/blockdevice": FileTypeCategory.BLOCK,
    "inode/chardevice": FileTypeCategory.CHAR,
    "inode/fifo": FileTypeCategory.FIFO,
    "inode/socket": FileTypeCategory.SOCK,
    "inode/symlink": FileTypeCategory.SYMLINK,
    # --- Generic Prefixes (Fallbacks) ---
    "text/": FileTypeCategory.TEXT,
    "image/": FileTypeCategory.IMAGE,
    "audio/": FileTypeCategory.AUDIO,
    "video/": FileTypeCategory.VIDEO,
    "font/": FileTypeCategory.BINARY,
    "model/": FileTypeCategory.BINARY,
}
# fmt: on
[docs]
def detect_file_type(
    self, file_path: Path, strip_prefix: str = "", use_magic=False
) -> FileTypeInfo:
    """
    Detect comprehensive file type information, optionally using
    python-magic for MIME type detection.

    When ``use_magic`` is false the type is guessed from the path alone.
    When it is true the content at ``file_path`` is inspected with libmagic.
    In both modes the path handed to the categorization rules has
    ``strip_prefix`` removed, so that location-based rules see the same
    path regardless of the detection mode.

    :param file_path: The path to the file to inspect.
    :type file_path: ``Path``.
    :param strip_prefix: An optional prefix (mount root) to strip from the
                         ``file_path`` when guessing and categorizing file
                         types.
    :type strip_prefix: ``str``
    :param use_magic: Use the ``magic`` module to detect the MIME type,
                      encoding and description instead of guessing them
                      from the path.
    :type use_magic: ``bool``
    :returns: File type information for ``file_path``. If magic detection
              fails with one of the handled errors a warning is logged and
              an ``application/octet-stream`` / ``UNKNOWN`` result is
              returned.
    :rtype: ``FileTypeInfo``
    :raises: ``SnapmNotFoundError`` if ``use_magic`` is set but the
             ``magic`` module could not be imported.
    """
    # Path as seen from inside the inspected tree: this is what the
    # path-based rules (e.g. a leading "/etc/") are written against.
    logical_path = Path(str(file_path).removeprefix(strip_prefix))

    if not use_magic:
        return self._guess_file_type(logical_path)

    if not _HAVE_MAGIC:
        raise SnapmNotFoundError("python-file-magic is not installed")

    # c9s magic does not have magic.error
    if hasattr(magic, "error"):
        magic_errors = (magic.error, OSError, ValueError)
    else:
        magic_errors = (OSError, ValueError)

    try:
        # libmagic must open the real on-disk path (prefix included).
        fm = magic.detect_from_filename(str(file_path))
        mime_type = fm.mime_type
        encoding = fm.encoding
        description = fm.name
        # Categorize using the prefix-stripped path so that the result
        # matches what the non-magic branch would apply its rules to.
        category = self._categorize_file(mime_type, logical_path)
        return FileTypeInfo(mime_type, description, category, encoding)
    except magic_errors as err:
        _log_warn("Error detecting file type for %s: %s", str(file_path), err)
        return FileTypeInfo(
            "application/octet-stream", "unknown", FileTypeCategory.UNKNOWN
        )
# pylint: disable=too-many-return-statements
[docs]
def _categorize_file(self, mime_type: str, file_path: Path) -> FileTypeCategory:
    """
    Map a MIME type and file path onto a ``FileTypeCategory``.

    Location and extension rules are evaluated first, in a fixed order
    (logs, configuration, databases, systemd units). Only when none of
    them applies is the lower-cased MIME type compared against the
    prefixes in ``self.category_rules``; the first prefix that matches,
    in the mapping's iteration order, decides the category.

    :param mime_type: Detected file MIME type.
    :type mime_type: ``str``
    :param file_path: Path to the file to categorize.
    :type file_path: ``Path``
    :returns: The matching category, or ``FileTypeCategory.BINARY`` when
              no rule applies.
    :rtype: ``FileTypeCategory``
    """
    mime_lc = mime_type.lower()
    lowered = str(file_path).lower()

    # Log files: by directory component or by extension.
    if lowered.endswith(".log") or "/log/" in lowered:
        return (
            FileTypeCategory.BINARY_LOG
            if _is_binary_log(file_path)
            else FileTypeCategory.LOG
        )

    # Configuration: anything below /etc/ or carrying a .conf extension.
    if lowered.endswith(".conf") or lowered.startswith("/etc/"):
        return FileTypeCategory.CONFIG

    # Databases: by extension or a "database" substring in the path.
    if lowered.endswith((".db", ".sqlite")) or "database" in lowered:
        return FileTypeCategory.DATABASE

    # Systemd unit files living outside /etc are still configuration.
    # The suffix test is case-sensitive: it uses the original path.
    if file_path.suffix in SYSTEMD_UNIT_EXTENSIONS:
        return FileTypeCategory.CONFIG

    # Fall back to MIME prefix rules; first match wins.
    mime_matches = (
        rule_category
        for prefix, rule_category in self.category_rules.items()
        if mime_lc.startswith(prefix)
    )
    return next(mime_matches, FileTypeCategory.BINARY)
[docs]
def _guess_file_type(self, file_path: Path) -> FileTypeInfo:
    """
    Build a ``FileTypeInfo`` for ``file_path`` without using python-magic.

    The MIME type, description and encoding come from ``_guess_file()``;
    the category is then derived from the guessed MIME type and the path
    by ``_categorize_file()``.

    :param file_path: The path to guess file type for.
    :type file_path: ``Path``
    :returns: A ``FileTypeInfo`` object with a best-effort guess of the
              file type.
    :rtype: ``FileTypeInfo``
    """
    mime_type, description, encoding = _guess_file(file_path)
    return FileTypeInfo(
        mime_type,
        description,
        self._categorize_file(mime_type, file_path),
        encoding,
    )