Signature matching

This commit is contained in:
Alex Ling 2021-01-19 08:43:45 +00:00
parent 7f76322377
commit 667d390be4
8 changed files with 247 additions and 40 deletions

View File

@ -0,0 +1,50 @@
# Database migration that introduces a `signature` column on the `ids` table.
# NOTE(review): `MG::Base` is presumably the migration base class whose
# `up`/`down` strings are executed as SQL — confirm against the MG library.
class IDSignature < MG::Base
# Returns the SQL run when upgrading: adds a TEXT column named `signature`
# (no NOT NULL constraint, no default) to `ids`.
def up : String
<<-SQL
ALTER TABLE ids ADD COLUMN signature TEXT;
SQL
end
# Returns the SQL run when downgrading.
#
# The column is removed by rebuilding the table: `ids` is renamed to `tmp`,
# a fresh `ids` with only `path` and `id` is created and filled from `tmp`,
# and `tmp` is dropped. The unique indices on `ids` are then recreated.
# `thumbnails` is rebuilt the same way so that its foreign key refers to the
# new `ids` table, and its unique index is recreated afterwards.
# NOTE(review): the rename-and-copy approach is presumably used because the
# targeted SQLite version cannot drop a column in place — confirm.
def down : String
<<-SQL
-- remove signature column from ids
ALTER TABLE ids RENAME TO tmp;
CREATE TABLE ids (
path TEXT NOT NULL,
id TEXT NOT NULL
);
INSERT INTO ids
SELECT path, id
FROM tmp;
DROP TABLE tmp;
-- recreate the indices
CREATE UNIQUE INDEX path_idx ON ids (path);
CREATE UNIQUE INDEX id_idx ON ids (id);
-- recreate the foreign key constraint on thumbnails
ALTER TABLE thumbnails RENAME TO tmp;
CREATE TABLE thumbnails (
id TEXT NOT NULL,
data BLOB NOT NULL,
filename TEXT NOT NULL,
mime TEXT NOT NULL,
size INTEGER NOT NULL,
FOREIGN KEY (id) REFERENCES ids (id)
ON UPDATE CASCADE
ON DELETE CASCADE
);
INSERT INTO thumbnails
SELECT * FROM tmp;
DROP TABLE tmp;
CREATE UNIQUE INDEX tn_index ON thumbnails (id);
SQL
end
end

View File

@ -0,0 +1,31 @@
# Database migration that rewrites the `path` column of the `ids` and
# `titles` tables from absolute paths to paths relative to the configured
# library directory (and back again on downgrade).
class RelativePath < MG::Base
  # Returns the SQL run when upgrading.
  #
  # Only a leading "<library path>/" is removed, and only from rows that
  # actually start with it. The previous `REPLACE(path, base, '')` form
  # removed every occurrence of the library path inside a stored path and
  # left a leading "/" behind, producing values that differ from what
  # `Path#relative_to` yields for the same file.
  def up : String
    prefix = sql_library_prefix
    <<-SQL
    -- update the path column in ids to relative paths
    UPDATE ids
    SET path = SUBSTR(path, LENGTH('#{prefix}') + 1)
    WHERE SUBSTR(path, 1, LENGTH('#{prefix}')) = '#{prefix}';
    -- update the path column in titles to relative paths
    UPDATE titles
    SET path = SUBSTR(path, LENGTH('#{prefix}') + 1)
    WHERE SUBSTR(path, 1, LENGTH('#{prefix}')) = '#{prefix}';
    SQL
  end

  # Returns the SQL run when downgrading.
  #
  # Prepends "<library path>/" to every stored path that is not already
  # absolute, which is the exact inverse of `#up` (rows that `#up` left
  # untouched because they were outside the library stay untouched here).
  def down : String
    prefix = sql_library_prefix
    <<-SQL
    -- update the path column in ids to absolute paths
    UPDATE ids
    SET path = '#{prefix}' || path
    WHERE SUBSTR(path, 1, 1) <> '/';
    -- update the path column in titles to absolute paths
    UPDATE titles
    SET path = '#{prefix}' || path
    WHERE SUBSTR(path, 1, 1) <> '/';
    SQL
  end

  # Returns the library path with exactly one trailing "/", with single
  # quotes doubled so it can be embedded in a single-quoted SQL literal.
  private def sql_library_prefix : String
    base = Config.current.library_path
    base = base[...-1] if base.ends_with? "/"
    "#{base}/".gsub("'", "''")
  end
end

View File

@ -11,13 +11,14 @@ class Entry
@title = File.basename @zip_path, File.extname @zip_path
@encoded_title = URI.encode @title
@size = (File.size @zip_path).humanize_bytes
id = storage.get_id @zip_path, false
id = storage.get_entry_id @zip_path, File.signature(@zip_path)
if id.nil?
id = random_str
storage.insert_id({
path: @zip_path,
id: id,
is_title: false,
title_signature: nil,
entry_signature: File.signature(@zip_path).to_s,
})
end
@id = id

View File

@ -42,16 +42,6 @@ class Library
end
end
end
db_interval = Config.current.db_optimization_interval_hours
unless db_interval < 1
spawn do
loop do
Storage.default.optimize
sleep db_interval.hours
end
end
end
end
def titles
@ -119,6 +109,7 @@ class Library
storage.close
Logger.debug "Scan completed"
Storage.default.optimize
end
def get_continue_reading_entries(username)

View File

@ -3,19 +3,21 @@ require "../archive"
class Title
getter dir : String, parent_id : String, title_ids : Array(String),
entries : Array(Entry), title : String, id : String,
encoded_title : String, mtime : Time, signature : UInt64 = 0
encoded_title : String, mtime : Time, signature : UInt64
@entry_display_name_cache : Hash(String, String)?
def initialize(@dir : String, @parent_id)
storage = Storage.default
id = storage.get_id @dir, true
@signature = Dir.signature dir
id = storage.get_title_id dir, signature
if id.nil?
id = random_str
storage.insert_id({
path: @dir,
path: dir,
id: id,
is_title: true,
title_signature: signature.to_s,
entry_signature: nil,
})
end
@id = id
@ -25,8 +27,6 @@ class Title
@entries = [] of Entry
@mtime = File.info(dir).modification_time
signatures = [] of UInt64
Dir.entries(dir).each do |fn|
next if fn.starts_with? "."
path = File.join dir, fn
@ -35,18 +35,14 @@ class Title
next if title.entries.size == 0 && title.titles.size == 0
Library.default.title_hash[title.id] = title
@title_ids << title.id
signatures << title.signature
next
end
if [".zip", ".cbz", ".rar", ".cbr"].includes? File.extname path
entry = Entry.new path, self
@entries << entry if entry.pages > 0 || entry.err_msg
signatures << File.size entry.zip_path
end
end
@signature = Digest::CRC32.checksum(signatures.sort.join "").to_u64
mtimes = [@mtime]
mtimes += @title_ids.map { |e| Library.default.title_hash[e].mtime }
mtimes += @entries.map { |e| e.mtime }

View File

@ -20,9 +20,11 @@ class Storage
@path : String
@db : DB::Database?
alias IDTuple = NamedTuple(path: String,
alias IDTuple = NamedTuple(
path: String,
id: String,
is_title: Bool)
entry_signature: String?,
title_signature: String?)
use_default
@ -230,16 +232,82 @@ class Storage
end
end
def get_id(path, is_title)
def get_title_id(path, signature)
id = nil
path = Path.new(path).relative_to(Config.current.library_path).to_s
MainFiber.run do
get_db do |db|
if is_title
id = db.query_one? "select id from titles where path = (?)", path,
# First attempt to find the matching title in DB using BOTH path
# and signature
id = db.query_one? "select id from titles where path = (?) and " \
"signature = (?)", path, signature.to_s, as: String
should_update = id.nil?
# If it fails, try to match using the path only. This could happen
# for example when a new entry is added to the title
id ||= db.query_one? "select id from titles where path = (?)", path,
as: String
else
id = db.query_one? "select id from ids where path = (?)", path,
# If it still fails, we will have to rely on the signature values.
# This could happen when the user moved or renamed the title, or
# a title containing the title
unless id
# If there are multiple rows with the same signature (this could
# happen simply by bad luck, or when the user copied a title),
# pick the row that has the most similar path to the given path
rows = [] of Tuple(String, String)
db.query "select id, path from titles where signature = (?)",
signature.to_s do |rs|
rs.each do
rows << {rs.read(String), rs.read(String)}
end
end
row = rows.max_by?(&.[1].components_similarity(path))
id = row[0] if row
end
# At this point, `id` would still be nil if there's no row matching
# either the path or the signature
# If we did identify a matching title, save the path and signature
# values back to the DB
if id && should_update
db.exec "update titles set path = (?), signature = (?) " \
"where id = (?)", path, signature.to_s, id
end
end
end
id
end
# See the comments in `#get_title_id` to see how this method works.
def get_entry_id(path, signature)
id = nil
path = Path.new(path).relative_to(Config.current.library_path).to_s
MainFiber.run do
get_db do |db|
id = db.query_one? "select id from ids where path = (?) and " \
"signature = (?)", path, signature.to_s, as: String
should_update = id.nil?
id ||= db.query_one? "select id from ids where path = (?)", path,
as: String
unless id
rows = [] of Tuple(String, String)
db.query "select id, path from ids where signature = (?)",
signature.to_s do |rs|
rs.each do
rows << {rs.read(String), rs.read(String)}
end
end
row = rows.max_by?(&.[1].components_similarity(path))
id = row[0] if row
end
if id && should_update
db.exec "update ids set path = (?), signature = (?) " \
"where id = (?)", path, signature.to_s, id
end
end
end
@ -256,11 +324,14 @@ class Storage
db.transaction do |tran|
conn = tran.connection
@@insert_ids.each do |tp|
if tp[:is_title]
conn.exec "insert into titles values (?, ?, null)", tp[:id],
tp[:path]
path = Path.new(tp[:path])
.relative_to(Config.current.library_path).to_s
if tp[:title_signature]
conn.exec "insert into titles values (?, ?, ?)", tp[:id],
path, tp[:title_signature].to_s
else
conn.exec "insert into ids values (?, ?)", tp[:path], tp[:id]
conn.exec "insert into ids values (?, ?, ?)", path, tp[:id],
tp[:entry_signature].to_s
end
end
end
@ -363,7 +434,8 @@ class Storage
db.query "select path, id from ids" do |rs|
rs.each do
path = rs.read String
trash_ids << rs.read String unless File.exists? path
fullpath = Path.new(path).expand(Config.current.library_path).to_s
trash_ids << rs.read String unless File.exists? fullpath
end
end
@ -377,7 +449,8 @@ class Storage
db.query "select path, id from titles" do |rs|
rs.each do
path = rs.read String
trash_titles << rs.read String unless Dir.exists? path
fullpath = Path.new(path).expand(Config.current.library_path).to_s
trash_titles << rs.read String unless Dir.exists? fullpath
end
end

50
src/util/signature.cr Normal file
View File

@ -0,0 +1,50 @@
class File
  abstract struct Info
    # Returns the inode number taken from the underlying stat result.
    def inode
      @stat.st_ino
    end
  end

  # Returns the signature of the file at *filename*.
  #
  # Files whose extension is not one of `.zip`, `.rar`, `.cbz` or `.cbr`
  # (compared case-sensitively) yield `0`. For supported files the signature
  # is the CRC32 checksum of the inode number, the size in bytes and the
  # modification time (Unix seconds), sorted numerically and concatenated.
  # The intent is that moving (unless to another device) or renaming the file
  # preserves the signature, while copying or editing the file changes it.
  #
  # All three values are read from a single `File.info` call so they describe
  # the same snapshot of the file; a separate `File.size` lookup would stat
  # the file a second time and could observe a different state.
  def self.signature(filename) : UInt64
    return 0u64 unless %w(.zip .rar .cbz .cbr).includes? File.extname filename
    info = File.info filename
    signatures = [
      info.inode,
      info.size,
      info.modification_time.to_unix,
    ]
    # The sort-then-join form is kept as is: the resulting values are
    # compared against previously computed signatures, so the digest input
    # must not change.
    Digest::CRC32.checksum(signatures.sort.join).to_u64
  end
end
class Dir
  # Computes the signature of the directory at *dirname*.
  #
  # The value is the CRC32 checksum of the directory's own modification time
  # (Unix seconds) together with the signatures of its children, sorted
  # numerically and concatenated. Children whose name starts with "." are
  # ignored, sub-directories contribute their own recursive signature, and
  # files contribute `File.signature` only when that value is non-zero (zero
  # marks an unsupported file). The intent is that moving (unless to another
  # device) or renaming the directory keeps the signature, while copying it
  # or editing its content changes it.
  def self.signature(dirname) : UInt64
    parts = [] of (UInt64 | Int64)
    parts << File.info(dirname).modification_time.to_unix

    visible = Dir.entries(dirname).reject &.starts_with?(".")
    visible.each do |name|
      child = File.join dirname, name
      if File.directory? child
        parts << Dir.signature child
        next
      end
      # Unsupported files report 0 and are left out of the digest
      file_sig = File.signature child
      parts << file_sig unless file_sig == 0
    end

    Digest::CRC32.checksum(parts.sort.join).to_u64
  end
end

View File

@ -92,3 +92,18 @@ def sort_titles(titles : Array(Title), opt : SortOptions, username : String)
ary
end
class String
  # Returns the similarity (in [0, 1]) of two paths.
  #
  # Both paths are split into their components. The components are then
  # paired up starting from the last one of each path, the pairs that are
  # equal are counted, and the count is divided by the number of components
  # of the shorter path.
  #
  # When either path has no components (e.g. an empty string) there is
  # nothing to compare and `0.0` is returned; dividing by the zero component
  # count would otherwise produce NaN, which is outside the documented range
  # and cannot be ordered by callers that rank candidates by similarity.
  def components_similarity(other : String) : Float64
    shorter, longer = [self, other]
      .map { |str| Path.new(str).parts }
      .sort_by &.size
    return 0.0 if shorter.empty?
    matches = shorter.reverse.zip(longer.reverse).count { |a, b| a == b }
    matches / shorter.size
  end
end