diff --git a/migration/ids_signature.7.cr b/migration/ids_signature.7.cr
new file mode 100644
index 0000000..48da8e7
--- /dev/null
+++ b/migration/ids_signature.7.cr
@@ -0,0 +1,51 @@
+# Adds a `signature` column to `ids`; `down` rebuilds `ids` without it.
+class IDSignature < MG::Base
+  def up : String
+    <<-SQL
+    ALTER TABLE ids ADD COLUMN signature TEXT;
+    SQL
+  end
+
+  def down : String
+    <<-SQL
+    -- remove signature column from ids
+    ALTER TABLE ids RENAME TO tmp;
+
+    CREATE TABLE ids (
+      path TEXT NOT NULL,
+      id TEXT NOT NULL
+    );
+
+    INSERT INTO ids
+    SELECT path, id
+    FROM tmp;
+
+    DROP TABLE tmp;
+
+    -- recreate the indices
+    CREATE UNIQUE INDEX path_idx ON ids (path);
+    CREATE UNIQUE INDEX id_idx ON ids (id);
+
+    -- recreate the foreign key constraint on thumbnails
+    ALTER TABLE thumbnails RENAME TO tmp;
+
+    CREATE TABLE thumbnails (
+      id TEXT NOT NULL,
+      data BLOB NOT NULL,
+      filename TEXT NOT NULL,
+      mime TEXT NOT NULL,
+      size INTEGER NOT NULL,
+      FOREIGN KEY (id) REFERENCES ids (id)
+        ON UPDATE CASCADE
+        ON DELETE CASCADE
+    );
+
+    INSERT INTO thumbnails
+    SELECT * FROM tmp;
+
+    DROP TABLE tmp;
+
+    CREATE UNIQUE INDEX tn_index ON thumbnails (id);
+    SQL
+  end
+end
diff --git a/migration/relative_path.8.cr b/migration/relative_path.8.cr
new file mode 100644
index 0000000..cf22080
--- /dev/null
+++ b/migration/relative_path.8.cr
@@ -0,0 +1,47 @@
+# Converts the absolute paths stored in `ids` and `titles` into paths relative
+# to the library directory (`up`), and back to absolute paths (`down`).
+class RelativePath < MG::Base
+  # Returns the library path as a SQL-safe prefix: it always ends with "/" so
+  # that stripping it yields the same form `Path#relative_to` produces (no
+  # leading slash), and single quotes are doubled so the value can be embedded
+  # in a SQL string literal.
+  private def sql_base : String
+    base = Config.current.library_path
+    base += "/" unless base.ends_with? "/"
+    base.gsub "'", "''"
+  end
+
+  def up : String
+    base = sql_base
+
+    <<-SQL
+    -- update the path column in ids to relative paths. Only the leading
+    -- library prefix is stripped, and only from rows that start with it
+    UPDATE ids
+    SET path = SUBSTR(path, LENGTH('#{base}') + 1)
+    WHERE SUBSTR(path, 1, LENGTH('#{base}')) = '#{base}';
+
+    -- update the path column in titles to relative paths
+    UPDATE titles
+    SET path = SUBSTR(path, LENGTH('#{base}') + 1)
+    WHERE SUBSTR(path, 1, LENGTH('#{base}')) = '#{base}';
+    SQL
+  end
+
+  def down : String
+    base = sql_base
+
+    <<-SQL
+    -- update the path column in ids to absolute paths. Rows that are
+    -- already absolute (not converted by `up`) are left alone
+    UPDATE ids
+    SET path = '#{base}' || path
+    WHERE SUBSTR(path, 1, 1) != '/';
+
+    -- update the path column in titles to absolute paths
+    UPDATE titles
+    SET path = '#{base}' || path
+    WHERE SUBSTR(path, 1, 1) != '/';
+    SQL
+  end
+end
diff --git a/src/library/entry.cr b/src/library/entry.cr
index 3e270cf..e3599a8 100644
--- a/src/library/entry.cr
+++ b/src/library/entry.cr
@@ -11,13 +11,16 @@ class Entry
     @title = File.basename @zip_path, File.extname @zip_path
     @encoded_title = URI.encode @title
     @size = (File.size @zip_path).humanize_bytes
-    id = storage.get_id @zip_path, false
+    # Compute the signature once so the lookup and the insert always agree
+    signature = File.signature @zip_path
+    id = storage.get_entry_id @zip_path, signature
     if id.nil?
       id = random_str
       storage.insert_id({
-        path:     @zip_path,
-        id:       id,
-        is_title: false,
+        path:            @zip_path,
+        id:              id,
+        title_signature: nil,
+        entry_signature: signature.to_s,
       })
     end
     @id = id
diff --git a/src/library/library.cr b/src/library/library.cr
index a3db640..1dc426a 100644
--- a/src/library/library.cr
+++ b/src/library/library.cr
@@ -42,16 +42,6 @@ class Library
         end
       end
     end
-
-    db_interval = Config.current.db_optimization_interval_hours
-    unless db_interval < 1
-      spawn do
-        loop do
-          Storage.default.optimize
-          sleep db_interval.hours
-        end
-      end
-    end
   end
 
   def titles
@@ -119,6 +109,7 @@ class Library
     storage.close
 
     Logger.debug "Scan completed"
+    Storage.default.optimize
   end
 
   def get_continue_reading_entries(username)
diff --git a/src/library/title.cr b/src/library/title.cr
index 024f532..69441c9 100644
--- a/src/library/title.cr
+++ b/src/library/title.cr
@@ -3,19 +3,21 @@ require "../archive"
 class Title
   getter dir : String, parent_id : String, title_ids : Array(String),
     entries : Array(Entry), title : String, id : String,
-    encoded_title : String, mtime : Time, signature : UInt64 = 0
+    encoded_title : String, mtime : Time, signature : UInt64
 
   @entry_display_name_cache : Hash(String, String)?
 
   def initialize(@dir : String, @parent_id)
     storage = Storage.default
-    id = storage.get_id @dir, true
+    @signature = Dir.signature dir
+    id = storage.get_title_id dir, signature
     if id.nil?
       id = random_str
       storage.insert_id({
-        path:     @dir,
-        id:       id,
-        is_title: true,
+        path:            dir,
+        id:              id,
+        title_signature: signature.to_s,
+        entry_signature: nil,
       })
     end
     @id = id
@@ -25,8 +27,6 @@ class Title
     @entries = [] of Entry
     @mtime = File.info(dir).modification_time
 
-    signatures = [] of UInt64
-
     Dir.entries(dir).each do |fn|
       next if fn.starts_with? "."
       path = File.join dir, fn
@@ -35,18 +35,14 @@ class Title
         next if title.entries.size == 0 && title.titles.size == 0
         Library.default.title_hash[title.id] = title
         @title_ids << title.id
-        signatures << title.signature
         next
       end
       if [".zip", ".cbz", ".rar", ".cbr"].includes? File.extname path
         entry = Entry.new path, self
         @entries << entry if entry.pages > 0 || entry.err_msg
-        signatures << File.size entry.zip_path
       end
     end
 
-    @signature = Digest::CRC32.checksum(signatures.sort.join "").to_u64
-
     mtimes = [@mtime]
     mtimes += @title_ids.map { |e| Library.default.title_hash[e].mtime }
     mtimes += @entries.map { |e| e.mtime }
diff --git a/src/storage.cr b/src/storage.cr
index dcc337c..d437387 100644
--- a/src/storage.cr
+++ b/src/storage.cr
@@ -20,9 +20,11 @@ class Storage
   @path : String
   @db : DB::Database?
 
-  alias IDTuple = NamedTuple(path: String,
+  alias IDTuple = NamedTuple(
+    path: String,
     id: String,
-    is_title: Bool)
+    entry_signature: String?,
+    title_signature: String?)
 
   use_default
 
@@ -230,16 +232,82 @@ class Storage
     end
   end
 
-  def get_id(path, is_title)
+  def get_title_id(path, signature)
     id = nil
+    path = Path.new(path).relative_to(Config.current.library_path).to_s
     MainFiber.run do
       get_db do |db|
-        if is_title
-          id = db.query_one? "select id from titles where path = (?)", path,
-            as: String
-        else
-          id = db.query_one? "select id from ids where path = (?)", path,
-            as: String
+        # First attempt to find the matching title in DB using BOTH path
+        # and signature
+        id = db.query_one? "select id from titles where path = (?) and " \
+                           "signature = (?)", path, signature.to_s, as: String
+
+        should_update = id.nil?
+        # If it fails, try to match using the path only. This could happen
+        # for example when a new entry is added to the title
+        id ||= db.query_one? "select id from titles where path = (?)", path,
+          as: String
+
+        # If it still fails, we will have to rely on the signature values.
+        # This could happen when the user moved or renamed the title, or
+        # a title containing the title
+        unless id
+          # If there are multiple rows with the same signature (this could
+          # happen simply by bad luck, or when the user copied a title),
+          # pick the row that has the most similar path to the given path
+          rows = [] of Tuple(String, String)
+          db.query "select id, path from titles where signature = (?)",
+            signature.to_s do |rs|
+            rs.each do
+              rows << {rs.read(String), rs.read(String)}
+            end
+          end
+          row = rows.max_by?(&.[1].components_similarity(path))
+          id = row[0] if row
+        end
+
+        # At this point, `id` would still be nil if there's no row matching
+        # either the path or the signature
+
+        # If we did identify a matching title, save the path and signature
+        # values back to the DB
+        if id && should_update
+          db.exec "update titles set path = (?), signature = (?) " \
+                  "where id = (?)", path, signature.to_s, id
+        end
+      end
+    end
+    id
+  end
+
+  # See the comments in `#get_title_id` to see how this method works.
+  def get_entry_id(path, signature)
+    id = nil
+    path = Path.new(path).relative_to(Config.current.library_path).to_s
+    MainFiber.run do
+      get_db do |db|
+        id = db.query_one? "select id from ids where path = (?) and " \
+                           "signature = (?)", path, signature.to_s, as: String
+
+        should_update = id.nil?
+        id ||= db.query_one? "select id from ids where path = (?)", path,
+          as: String
+
+        unless id
+          rows = [] of Tuple(String, String)
+          db.query "select id, path from ids where signature = (?)",
+            signature.to_s do |rs|
+            rs.each do
+              rows << {rs.read(String), rs.read(String)}
+            end
+          end
+          row = rows.max_by?(&.[1].components_similarity(path))
+          id = row[0] if row
+        end
+
+        if id && should_update
+          db.exec "update ids set path = (?), signature = (?) " \
+                  "where id = (?)", path, signature.to_s, id
         end
       end
     end
@@ -256,11 +324,14 @@ class Storage
         db.transaction do |tran|
           conn = tran.connection
           @@insert_ids.each do |tp|
-            if tp[:is_title]
-              conn.exec "insert into titles values (?, ?, null)", tp[:id],
-                tp[:path]
+            path = Path.new(tp[:path])
+              .relative_to(Config.current.library_path).to_s
+            if tp[:title_signature]
+              conn.exec "insert into titles values (?, ?, ?)", tp[:id],
+                path, tp[:title_signature].to_s
             else
-              conn.exec "insert into ids values (?, ?)", tp[:path], tp[:id]
+              conn.exec "insert into ids values (?, ?, ?)", path, tp[:id],
+                tp[:entry_signature].to_s
             end
           end
         end
@@ -363,7 +434,8 @@ class Storage
         db.query "select path, id from ids" do |rs|
           rs.each do
             path = rs.read String
-            trash_ids << rs.read String unless File.exists? path
+            fullpath = Path.new(path).expand(Config.current.library_path).to_s
+            trash_ids << rs.read String unless File.exists? fullpath
           end
         end
 
@@ -377,7 +449,8 @@ class Storage
         db.query "select path, id from titles" do |rs|
           rs.each do
             path = rs.read String
-            trash_titles << rs.read String unless Dir.exists? path
+            fullpath = Path.new(path).expand(Config.current.library_path).to_s
+            trash_titles << rs.read String unless Dir.exists? fullpath
           end
         end
 
diff --git a/src/util/signature.cr b/src/util/signature.cr
new file mode 100644
index 0000000..0db6b21
--- /dev/null
+++ b/src/util/signature.cr
@@ -0,0 +1,51 @@
+class File
+  abstract struct Info
+    # Returns the inode number read from the underlying stat data.
+    def inode
+      @stat.st_ino
+    end
+  end
+
+  # Returns the signature of the file at filename.
+  # When it is not a supported file, returns 0. Otherwise, calculate the
+  # signature by combining its inode value, file size and mtime. This
+  # ensures that moving (unless to another device) and renaming the file
+  # preserves the signature, while copying or editing the file changes it.
+  def self.signature(filename) : UInt64
+    return 0u64 unless %w(.zip .rar .cbz .cbr).includes? File.extname filename
+    info = File.info filename
+    signatures = [
+      info.inode,
+      File.size(filename),
+      info.modification_time.to_unix,
+    ]
+    Digest::CRC32.checksum(signatures.sort.join).to_u64
+  end
+end
+
+class Dir
+  # Returns the signature of the directory at dirname.
+  # The signature is calculated by combining its mtime and the signatures of
+  # all directories and files in it. This ensures that moving (unless to
+  # another device) and renaming the directory preserves the signature,
+  # while copying or editing its content changes it.
+  def self.signature(dirname) : UInt64
+    signatures = [] of (UInt64 | Int64)
+    signatures << File.info(dirname).modification_time.to_unix
+    self.open dirname do |dir|
+      dir.entries.each do |fn|
+        next if fn.starts_with? "."
+        path = File.join dirname, fn
+        if File.directory? path
+          signatures << Dir.signature path
+        else
+          _sig = File.signature path
+          # Only add its signature value to `signatures` when it is a
+          # supported file
+          signatures << _sig if _sig > 0
+        end
+      end
+    end
+    Digest::CRC32.checksum(signatures.sort.join).to_u64
+  end
+end
diff --git a/src/util/util.cr b/src/util/util.cr
index d7c0412..f174c39 100644
--- a/src/util/util.cr
+++ b/src/util/util.cr
@@ -92,3 +92,22 @@ def sort_titles(titles : Array(Title), opt : SortOptions, username : String)
 
   ary
 end
+
+class String
+  # Returns the similarity (in [0, 1]) of two paths.
+  # For the two paths, separate them into arrays of components, count the
+  # number of matching components backwards, and divide the count by the
+  # number of components of the shorter path.
+  # Returns 0.0 when the shorter path has no components.
+  def components_similarity(other : String) : Float64
+    s, l = [self, other]
+      .map { |str| Path.new(str).parts }
+      .sort_by &.size
+
+    # Avoid 0 / 0 (NaN), which does not compare meaningfully in `max_by?`
+    return 0.0 if s.empty?
+
+    match = s.reverse.zip(l.reverse).count { |a, b| a == b }
+    match / s.size
+  end
+end