Signature matching

This commit is contained in:
Alex Ling
2021-01-19 08:43:45 +00:00
parent 7f76322377
commit 667d390be4
8 changed files with 247 additions and 40 deletions

50
src/util/signature.cr Normal file
View File

@@ -0,0 +1,50 @@
class File
  abstract struct Info
    # Returns the inode number stored in the underlying stat result.
    def inode
      @stat.st_ino
    end
  end

  # Returns the signature of the file at *filename*.
  #
  # When the extension is not one of .zip, .rar, .cbz or .cbr (compared
  # case-sensitively), returns 0 without touching the filesystem. Otherwise
  # the signature is the CRC32 of the file's inode, size and mtime (Unix
  # seconds), sorted numerically and concatenated as decimal strings. This
  # ensures that moving (unless to another device) and renaming the file
  # preserves the signature, while copying or editing the file changes it.
  #
  # Any error raised by `File.info` (e.g. the file does not exist) is
  # propagated to the caller.
  def self.signature(filename) : UInt64
    return 0u64 unless %w(.zip .rar .cbz .cbr).includes? File.extname filename
    # Read inode, size and mtime from one stat result so the three values
    # always describe the same state of the file.
    info = File.info filename
    signatures = [
      info.inode,
      info.size,
      info.modification_time.to_unix,
    ]
    Digest::CRC32.checksum(signatures.sort.join).to_u64
  end
end
class Dir
  # Returns the signature of the directory at *dirname*.
  #
  # The signature is the CRC32 of the directory's mtime (Unix seconds)
  # together with the signatures of its sub-directories (computed
  # recursively) and of its supported files, all sorted numerically and
  # concatenated as decimal strings. Entries whose name starts with "." are
  # ignored. This ensures that moving (unless to another device) and renaming
  # the directory preserves the signature, while copying or editing its
  # content changes it.
  def self.signature(dirname) : UInt64
    parts = [] of (UInt64 | Int64)
    parts << File.info(dirname).modification_time.to_unix

    Dir.open(dirname) do |handle|
      handle.entries.each do |name|
        # Skips ".", ".." and hidden entries.
        next if name.starts_with? "."

        child = File.join dirname, name
        if File.directory? child
          parts << Dir.signature child
          next
        end

        # `File.signature` returns 0 for unsupported files; those do not
        # contribute to the directory signature.
        file_sig = File.signature child
        parts << file_sig unless file_sig == 0
      end
    end

    Digest::CRC32.checksum(parts.sort.join).to_u64
  end
end

View File

@@ -92,3 +92,18 @@ def sort_titles(titles : Array(Title), opt : SortOptions, username : String)
ary
end
class String
  # Returns the similarity (in [0, 1]) of two paths.
  #
  # Both paths are split into their components. The components are then
  # compared pairwise starting from the last one, over the length of the
  # shorter path, and the number of equal pairs is divided by the number of
  # components of the shorter path.
  #
  # Returns 0.0 when either path has no components (e.g. an empty string),
  # since the division would otherwise be 0 / 0 and yield NaN.
  def components_similarity(other : String) : Float64
    shorter, longer = [self, other]
      .map { |str| Path.new(str).parts }
      .sort_by &.size
    return 0.0 if shorter.empty?

    # Zipping the reversed arrays aligns the components from the end; the
    # shorter array is the receiver so every element has a counterpart.
    matches = shorter.reverse.zip(longer.reverse).count { |a, b| a == b }
    matches / shorter.size
  end
end