Merge pull request #229 from Leeingnyo/feature/preserve-scanned-titles

Reuse scanned titles on boot and scanning
This commit is contained in:
Alex Ling 2021-09-18 16:14:31 +08:00 committed by GitHub
commit 6e04e249e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 234 additions and 12 deletions

View File

@ -1,6 +1,9 @@
require "image_size"
require "yaml"
class Entry
include YAML::Serializable
getter zip_path : String, book : Title, title : String,
size : String, pages : Int32, id : String, encoded_path : String,
encoded_title : String, mtime : Time, err_msg : String?

View File

@ -1,12 +1,41 @@
class Library
include YAML::Serializable
getter dir : String, title_ids : Array(String),
title_hash : Hash(String, Title)
use_default
def initialize
register_mime_types
def save_instance
path = Config.current.library_path
instance_file_path = File.join path, "library.yml.gz"
Logger.debug "Caching library to #{instance_file_path}"
writer = Compress::Gzip::Writer.new instance_file_path,
Compress::Gzip::BEST_COMPRESSION
writer.write self.to_yaml.to_slice
writer.close
end
def self.load_instance
dir = Config.current.library_path
return unless Dir.exists? dir
instance_file_path = File.join dir, "library.yml.gz"
return unless File.exists? instance_file_path
Logger.debug "Loading cached library from #{instance_file_path}"
begin
Compress::Gzip::Reader.open instance_file_path do |content|
@@default = Library.from_yaml content
end
Library.default.register_jobs
rescue e
Logger.error e
end
end
def initialize
@dir = Config.current.library_path
# explicitly initialize @titles to bypass the compiler check. it will
# be filled with actual Titles in the `scan` call below
@ -16,6 +45,12 @@ class Library
@entries_count = 0
@thumbnails_count = 0
register_jobs
end
protected def register_jobs
register_mime_types
scan_interval = Config.current.scan_interval_minutes
if scan_interval < 1
scan
@ -25,7 +60,7 @@ class Library
start = Time.local
scan
ms = (Time.local - start).total_milliseconds
Logger.info "Scanned #{@title_ids.size} titles in #{ms}ms"
Logger.debug "Library initialized in #{ms}ms"
sleep scan_interval.minutes
end
end
@ -89,6 +124,7 @@ class Library
end
def scan
start = Time.local
unless Dir.exists? @dir
Logger.info "The library directory #{@dir} does not exist. " \
"Attempting to create it"
@ -97,14 +133,36 @@ class Library
storage = Storage.new auto_close: false
examine_context : ExamineContext = {
cached_contents_signature: {} of String => String,
deleted_title_ids: [] of String,
deleted_entry_ids: [] of String,
}
@title_ids.select! do |title_id|
title = @title_hash[title_id]
existence = title.examine examine_context
unless existence
examine_context["deleted_title_ids"].concat [title_id] +
title.deep_titles.map &.id
examine_context["deleted_entry_ids"].concat title.deep_entries.map &.id
end
existence
end
remained_title_dirs = @title_ids.map { |id| title_hash[id].dir }
examine_context["deleted_title_ids"].each do |title_id|
@title_hash.delete title_id
end
cache = examine_context["cached_contents_signature"]
(Dir.entries @dir)
.select { |fn| !fn.starts_with? "." }
.map { |fn| File.join @dir, fn }
.select { |path| !(remained_title_dirs.includes? path) }
.select { |path| File.directory? path }
.map { |path| Title.new path, "" }
.map { |path| Title.new path, "", cache }
.select { |title| !(title.entries.empty? && title.titles.empty?) }
.sort! { |a, b| a.title <=> b.title }
.tap { |_| @title_ids.clear }
.each do |title|
@title_hash[title.id] = title
@title_ids << title.id
@ -113,8 +171,15 @@ class Library
storage.bulk_insert_ids
storage.close
Logger.debug "Scan completed"
Storage.default.mark_unavailable
ms = (Time.local - start).total_milliseconds
Logger.info "Scanned #{@title_ids.size} titles in #{ms}ms"
Storage.default.mark_unavailable examine_context["deleted_entry_ids"],
examine_context["deleted_title_ids"]
spawn do
save_instance
end
end
def get_continue_reading_entries(username)

View File

@ -2,18 +2,24 @@ require "digest"
require "../archive"
class Title
include YAML::Serializable
getter dir : String, parent_id : String, title_ids : Array(String),
entries : Array(Entry), title : String, id : String,
encoded_title : String, mtime : Time, signature : UInt64,
entry_cover_url_cache : Hash(String, String)?
setter entry_cover_url_cache : Hash(String, String)?
@[YAML::Field(ignore: true)]
@entry_display_name_cache : Hash(String, String)?
@[YAML::Field(ignore: true)]
@entry_cover_url_cache : Hash(String, String)?
@[YAML::Field(ignore: true)]
@cached_display_name : String?
@[YAML::Field(ignore: true)]
@cached_cover_url : String?
def initialize(@dir : String, @parent_id)
def initialize(@dir : String, @parent_id, cache = {} of String => String)
storage = Storage.default
@signature = Dir.signature dir
id = storage.get_title_id dir, signature
@ -26,6 +32,7 @@ class Title
})
end
@id = id
@contents_signature = Dir.contents_signature dir, cache
@title = File.basename dir
@encoded_title = URI.encode @title
@title_ids = [] of String
@ -36,7 +43,7 @@ class Title
next if fn.starts_with? "."
path = File.join dir, fn
if File.directory? path
title = Title.new path, @id
title = Title.new path, @id, cache
next if title.entries.size == 0 && title.titles.size == 0
Library.default.title_hash[title.id] = title
@title_ids << title.id
@ -63,6 +70,106 @@ class Title
end
end
# Utility method used in library rescanning.
# - When the title does not exist on the file system anymore, return false
# and let it be deleted from the library instance
# - When the title exists, but its contents signature is now different from
# the cache, it means some of its content (nested titles or entries)
# has been added, deleted, or renamed. In this case we update its
# contents signature and instance variables
# - When the title exists and its contents signature is still the same, we
# return true so it can be reused without rescanning
def examine(context : ExamineContext) : Bool
return false unless Dir.exists? @dir
contents_signature = Dir.contents_signature @dir,
context["cached_contents_signature"]
return true if @contents_signature == contents_signature
@contents_signature = contents_signature
@signature = Dir.signature @dir
storage = Storage.default
id = storage.get_title_id dir, signature
if id.nil?
id = random_str
storage.insert_title_id({
path: dir,
id: id,
signature: signature.to_s,
})
end
@id = id
@mtime = File.info(@dir).modification_time
previous_titles_size = @title_ids.size
@title_ids.select! do |title_id|
title = Library.default.get_title! title_id
existence = title.examine context
unless existence
context["deleted_title_ids"].concat [title_id] +
title.deep_titles.map &.id
context["deleted_entry_ids"].concat title.deep_entries.map &.id
end
existence
end
remained_title_dirs = @title_ids.map do |title_id|
title = Library.default.get_title! title_id
title.dir
end
previous_entries_size = @entries.size
@entries.select! do |entry|
existence = File.exists? entry.zip_path
Fiber.yield
context["deleted_entry_ids"] << entry.id unless existence
existence
end
remained_entry_zip_paths = @entries.map &.zip_path
is_titles_added = false
is_entries_added = false
Dir.entries(dir).each do |fn|
next if fn.starts_with? "."
path = File.join dir, fn
if File.directory? path
next if remained_title_dirs.includes? path
title = Title.new path, @id, context["cached_contents_signature"]
next if title.entries.size == 0 && title.titles.size == 0
Library.default.title_hash[title.id] = title
@title_ids << title.id
is_titles_added = true
next
end
if is_supported_file path
next if remained_entry_zip_paths.includes? path
entry = Entry.new path, self
if entry.pages > 0 || entry.err_msg
@entries << entry
is_entries_added = true
end
end
end
mtimes = [@mtime]
mtimes += @title_ids.map { |e| Library.default.title_hash[e].mtime }
mtimes += @entries.map &.mtime
@mtime = mtimes.max
if is_titles_added || previous_titles_size != @title_ids.size
@title_ids.sort! do |a, b|
compare_numerically Library.default.title_hash[a].title,
Library.default.title_hash[b].title
end
end
if is_entries_added || previous_entries_size != @entries.size
sorter = ChapterSorter.new @entries.map &.title
@entries.sort! do |a, b|
sorter.compare a.title, b.title
end
end
true
end
alias SortContext = NamedTuple(username: String, opt: SortOptions)
def build_json(*, slim = false, shallow = false,

View File

@ -133,3 +133,8 @@ class TitleInfo
LRUCache.set generate_cache_entry key, self.to_json
end
end
alias ExamineContext = NamedTuple(
cached_contents_signature: Hash(String, String),
deleted_title_ids: Array(String),
deleted_entry_ids: Array(String))

View File

@ -58,6 +58,7 @@ class CLI < Clim
LRUCache.init
Storage.default
Queue.default
Library.load_instance
Library.default
Plugin::Downloader.default

View File

@ -428,12 +428,21 @@ class Storage
end
end
def mark_unavailable
# Mark titles and entries that no longer exist on the file system as
# unavailable. By supplying `id_candidates` and `titles_candidates`, it
# only checks the existence of the candidate titles/entries to speed up
# the process.
def mark_unavailable(ids_candidates : Array(String)?,
titles_candidates : Array(String)?)
MainFiber.run do
get_db do |db|
# Detect dangling entry IDs
trash_ids = [] of String
db.query "select path, id from ids where unavailable = 0" do |rs|
query = "select path, id from ids where unavailable = 0"
unless ids_candidates.nil?
query += " and id in (#{ids_candidates.join "," { |i| "'#{i}'" }})"
end
db.query query do |rs|
rs.each do
path = rs.read String
fullpath = Path.new(path).expand(Config.current.library_path).to_s
@ -449,7 +458,11 @@ class Storage
# Detect dangling title IDs
trash_titles = [] of String
db.query "select path, id from titles where unavailable = 0" do |rs|
query = "select path, id from titles where unavailable = 0"
unless titles_candidates.nil?
query += " and id in (#{titles_candidates.join "," { |i| "'#{i}'" }})"
end
db.query query do |rs|
rs.each do
path = rs.read String
fullpath = Path.new(path).expand(Config.current.library_path).to_s

View File

@ -48,4 +48,32 @@ class Dir
end
Digest::CRC32.checksum(signatures.sort.join).to_u64
end
# Returns the contents signature of the directory at dirname for checking
# to rescan.
# Rescan conditions:
# - When a file added, moved, removed, renamed (including which in nested
# directories)
def self.contents_signature(dirname, cache = {} of String => String) : String
return cache[dirname] if cache[dirname]?
Fiber.yield
signatures = [] of String
self.open dirname do |dir|
dir.entries.sort.each do |fn|
next if fn.starts_with? "."
path = File.join dirname, fn
if File.directory? path
signatures << Dir.contents_signature path, cache
else
# Only add its signature value to `signatures` when it is a
# supported file
signatures << fn if is_supported_file fn
end
Fiber.yield
end
end
hash = Digest::SHA1.hexdigest(signatures.join)
cache[dirname] = hash
hash
end
end