From 2cac45bb309d604049509d8f39db90e294199123 Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Thu, 28 May 2026 16:47:32 -0400 Subject: [PATCH 1/4] ETT-1459: record item in feed_audit in collate * add first ingest date column to feed_audit table * record item in feed_audit at the end of collate * remove record_audit functionality from LocalPairtree (now unused except in development); emit warning (could record to feed_storage for consistency if we want instead, but we aren't really using it..?) * testing with storage classes in collate is a bit messy because of the distinction between depositing to the repo and reading back from the repo * add additional logging options in Stage (need to DRY out though) * additional logging for collate (should log duration; see ETT-824) * add some notes towards ETT-1687 * Mock depositing item for collate tests with mocked storage --- etc/config_test.yml | 13 ++-- etc/ingest.sql | 4 +- lib/HTFeed/Stage.pm | 21 +++++- lib/HTFeed/Stage/Collate.pm | 75 +++++++++++++++++- lib/HTFeed/Storage/LinkedPairtree.pm | 5 ++ lib/HTFeed/Storage/LocalPairtree.pm | 47 +----------- t/collate.t | 109 ++++++++++++++++++++------- t/lib/HTFeed/Test/TempDirs.pm | 5 +- t/storage_local_pairtree.t | 17 ----- t/storage_pairtree_object_store.t | 2 + 10 files changed, 191 insertions(+), 107 deletions(-) diff --git a/etc/config_test.yml b/etc/config_test.yml index 56faa46d..9704859e 100644 --- a/etc/config_test.yml +++ b/etc/config_test.yml @@ -7,17 +7,16 @@ test_staging: test_fixtures: /usr/local/feed/etc/feed_fixtures.sql repository: - # The directory in which symbolic links will be created to each volume - link_dir: /tmp/obj_link - # The directory into which volumes will be loaded + # Directory through which volumes in the repository should be accessed + link_dir: /tmp/obj + # Paths for deposit -- deprecated -- configure deposit locations through + # storage classes instead obj_dir: /tmp/obj backup_obj_dir: /tmp/obj_backup storage_classes: - linkedpairtree-test: - class: HTFeed::Storage::LinkedPairtree - # The directory in which symbolic links will be created to each volume - link_dir: /tmp/obj_link + pairtree-test: + class: HTFeed::Storage::LocalPairtree # The directory into which volumes will be loaded obj_dir: /tmp/obj diff --git a/etc/ingest.sql b/etc/ingest.sql index fe1a9d4b..1fbb619a 100644 --- a/etc/ingest.sql +++ b/etc/ingest.sql @@ -5,6 +5,7 @@ CREATE TABLE IF NOT EXISTS `feed_audit` ( `id` varchar(30) NOT NULL, `sdr_partition` tinyint(4) DEFAULT NULL, `zip_size` bigint(20) DEFAULT NULL, + `first_ingest_date` datetime NULL DEFAULT CURRENT_TIMESTAMP, `image_size` bigint(20) DEFAULT NULL, `zip_date` datetime DEFAULT NULL, `mets_size` bigint(20) DEFAULT NULL, @@ -15,7 +16,8 @@ CREATE TABLE IF NOT EXISTS `feed_audit` ( `md5check_ok` tinyint(1) DEFAULT NULL, `is_tombstoned` tinyint(1) DEFAULT NULL, PRIMARY KEY (`namespace`,`id`), - KEY `feed_audit_zip_date_idx` (`zip_date`) + KEY `feed_audit_zip_date_idx` (`zip_date`), + KEY `feed_audit_first_ingest_date_idx` (`first_ingest_date`) ); CREATE TABLE IF NOT EXISTS `feed_queue_disallow` ( diff --git a/lib/HTFeed/Stage.pm b/lib/HTFeed/Stage.pm index 7c610fba..7be10e19 100644 --- a/lib/HTFeed/Stage.pm +++ b/lib/HTFeed/Stage.pm @@ -80,6 +80,8 @@ sub failed { return 1; } + +# FIXME deduplicate sub set_error { my $self = shift; my $error = shift; @@ -100,14 +102,27 @@ sub set_error { } } -sub set_info { +sub log_warn { + my $self = shift; + my $message = shift; + + my $logger = get_logger( ref($self) ); + $logger->warn( + $message, + namespace => $self->{volume}->get_namespace(), + objid => $self->{volume}->get_objid(), + stage => ref($self), + @_ + ); +} + +sub log_info { my $self = shift; my $message = shift; my $logger = get_logger( ref($self) ); $logger->info( - 'Info', - detail => $message, + $message, namespace => $self->{volume}->get_namespace(), objid => $self->{volume}->get_objid(), stage => ref($self), diff --git a/lib/HTFeed/Stage/Collate.pm b/lib/HTFeed/Stage/Collate.pm index d289618a..9b76e278 100644 --- a/lib/HTFeed/Stage/Collate.pm +++ b/lib/HTFeed/Stage/Collate.pm @@ -13,6 +13,8 @@ use HTFeed::Storage::PairtreeObjectStore; use HTFeed::Storage::ObjectStore; use HTFeed::Storage::PrefixedVersions; use Log::Log4perl qw(get_logger); +use POSIX qw(strftime); +use HTFeed::DBTools qw(get_dbh); =head1 NAME @@ -38,18 +40,33 @@ sub run{ @storages = $self->storages unless @storages; foreach my $storage (@storages) { + my $rolled_back = 0; + if ($self->collate($storage)) { - get_logger->trace("finished collate to $storage, cleaning up"); + # TODO log how long it took + $self->log_info("finished collate to $storage->{name}, cleaning up"); $storage->cleanup } else { - get_logger->warn("collate to $storage failed, rolling back"); + $self->log_warn("collate to $storage->{name} failed, rolling back"); $storage->rollback; + $rolled_back = 1; } $storage->clean_staging(); $self->check_errors($storage); + + # If collate returned false but didn't raise an error, we still need to + # record that the stage failed + if($rolled_back) { + $self->set_error("OperationFailed",operation => "collate", detail => "collate to $storage->{name} failed; rolled back"); + } + + # TODO wait until *all* storages have succeeded to clean up *any* + # storages; roll back any that have failed + $self->log_repeat($storage); } + $self->record_audit() if !$self->{failed}; $self->_set_done(); $self->{job_metrics}->inc("ingest_collate_items_total"); @@ -64,7 +81,8 @@ sub log_repeat { if (-e $volume->get_zip_path() && -e $volume->get_mets_path()) { $self->{is_repeat} = 1; - $self->set_info('Collating volume that is already in repo'); + # deprecated format + $self->log_info('Collating volume that is already in repo'); } } @@ -73,7 +91,7 @@ sub collate { my $self = shift; my $storage = shift; - get_logger->trace("Starting collate for $storage"); + $self->log_info("Starting collate for $storage->{name}"); $storage->validate_zip_completeness && $storage->encrypt && @@ -125,4 +143,53 @@ sub clean_success { $self->{volume}->clean_sip_success(); } + +sub file_date { + my $self = shift; + my $file = shift; + + if (-e $file) { + my $seconds = (stat($file))[9]; + return strftime("%Y-%m-%d %H:%M:%S", localtime($seconds)); + } +} + +# updates the zip_date in the feed_audit table to the current timestamp for +# this zip in the repository +# +# first_ingest_date is set by default to CURRENT_TIMESTAMP on first insert +sub record_audit { + my $self = shift; + + my $stmt = + "insert into feed_audit (namespace, id, zip_size, zip_date, mets_size, mets_date) \ + values(?,?,?,?,?,?) \ + ON DUPLICATE KEY UPDATE zip_size=?, zip_date =?,mets_size=?,mets_date=?"; + + my $volume = $self->{volume}; + + my $repo_path = $volume->get_repository_symlink(); + my $zip_path = $volume->get_repository_zip_path; + die("Zip missing (in $repo_path) after collate") unless $zip_path and -e $zip_path; + + my $mets_path = $volume->get_repository_mets_path; + die("METS missing (in $repo_path) after collate") unless $mets_path and -e $mets_path; + + my $zipsize = -s $zip_path; + my $zipdate = $self->file_date($zip_path); + my $metssize = -s $mets_path; + my $metsdate = $self->file_date($mets_path); + my $sth = get_dbh()->prepare($stmt); + get_logger()->trace("feed_audit: $zip_path / $zipdate / $zipsize bytes"); + get_logger()->trace("feed_audit: $mets_path / $metsdate / $metssize bytes"); + my $res = $sth->execute( + $volume->{namespace}, $volume->{objid}, + $zipsize, $zipdate, $metssize, $metsdate, + # duplicate parameters for duplicate key update + $zipsize, $zipdate, $metssize, $metsdate + ); + + return $res; +} + 1; diff --git a/lib/HTFeed/Storage/LinkedPairtree.pm b/lib/HTFeed/Storage/LinkedPairtree.pm index f5263a45..180317f8 100644 --- a/lib/HTFeed/Storage/LinkedPairtree.pm +++ b/lib/HTFeed/Storage/LinkedPairtree.pm @@ -104,4 +104,9 @@ sub symlink_if_needed { return 1; } +sub record_audit { + get_logger()->warn("LinkedPairtree is deprecated; not recording audit"); + return 1; +} + 1; diff --git a/lib/HTFeed/Storage/LocalPairtree.pm b/lib/HTFeed/Storage/LocalPairtree.pm index c8c36a9a..f7564a68 100644 --- a/lib/HTFeed/Storage/LocalPairtree.pm +++ b/lib/HTFeed/Storage/LocalPairtree.pm @@ -7,10 +7,8 @@ use base qw(HTFeed::Storage); use File::Pairtree qw(id2ppath s2ppchars); use File::Path qw(make_path); -use HTFeed::DBTools qw(get_dbh); use HTFeed::VolumeValidator; use Log::Log4perl qw(get_logger); -use POSIX qw(strftime); use URI::Escape; sub object_path { @@ -106,50 +104,9 @@ sub make_object_path { return 1; } -sub file_date { - my $self = shift; - my $file = shift; - - if (-e $file) { - my $seconds = (stat($file))[9]; - return strftime("%Y-%m-%d %H:%M:%S", localtime($seconds)); - } -} - -# updates the zip_date in the feed_audit table to the current timestamp for -# this zip in the repository sub record_audit { - my $self = shift; - - my $start_time = $self->{job_metrics}->time; - my $path = $self->object_path(); - my ($sdr_partition) = ($path =~ qr#/?sdr(\d+)/?#); - - my $stmt = - "insert into feed_audit (namespace, id, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked, lastmd5check, md5check_ok) \ - values(?,?,?,?,?,?,?,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,1) \ - ON DUPLICATE KEY UPDATE sdr_partition = ?, zip_size=?, zip_date =?,mets_size=?,mets_date=?,lastchecked = CURRENT_TIMESTAMP,lastmd5check = CURRENT_TIMESTAMP, md5check_ok = 1"; - - # TODO populate image_size, page_count - - my $zipsize = $self->zip_size; - my $zipdate = $self->file_date($self->zip_obj_path); - my $metssize = $self->mets_size; - my $metsdate = $self->file_date($self->mets_obj_path); - my $sth = get_dbh()->prepare($stmt); - my $res = $sth->execute( - $self->{namespace}, $self->{objid}, - $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate, - # duplicate parameters for duplicate key update - $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate - ); - - my $end_time = $self->{job_metrics}->time; - my $delta_time = $end_time - $start_time; - $self->{job_metrics}->inc("ingest_record_audit_items_total"); - $self->{job_metrics}->add("ingest_record_audit_seconds_total", $delta_time); - - return $res; + get_logger()->warn("LocalPairtree for dev/testing purposes only; not recording audit"); + return 1; } 1; diff --git a/t/collate.t b/t/collate.t index a9c94e13..5f7365a9 100644 --- a/t/collate.t +++ b/t/collate.t @@ -2,6 +2,7 @@ use FindBin; use lib "$FindBin::Bin/lib"; use Test::Spec; +use File::Path qw(make_path); use HTFeed::Test::Support qw(load_db_fixtures); use HTFeed::Test::SpecSupport qw(stage_volume); use HTFeed::Config qw(set_config get_config); @@ -9,20 +10,38 @@ use HTFeed::DBTools qw(get_dbh); use Test::MockObject; describe "HTFeed::Collate" => sub { + spec_helper 'storage_helper.pl'; + local our ($tmpdirs, $testlog); context "with mocked storage" => sub { my $storage; my $collate; - before each => sub { - $storage = Test::MockObject->new(); + sub mocked_storage { + my $storage = Test::MockObject->new(); $storage->set_true(qw(stage validate_zip_completeness prevalidate make_object_path move postvalidate record_audit cleanup rollback clean_staging encrypt verify_crypt)); + $storage->{name} = "mock_storage"; + + return $storage; + } + + before each => sub { + $storage = mocked_storage(); my $volume = HTFeed::Volume->new(namespace => 'test', - id => 'test', + objid => 'test', packagetype => 'simple'); $collate = HTFeed::Stage::Collate->new(volume => $volume); + # TODO remove repo config for obj dir vs. link dir + # need to have something here so record_audit will be happy + my $obj_path = $tmpdirs->{obj_dir} . "/test/pairtree_root/te/st/test"; + make_path($obj_path); + system("touch $obj_path/test.zip"); + system("touch $obj_path/test.mets.xml"); + + get_dbh()->do("DELETE FROM feed_audit WHERE namespace = 'test'"); + }; context "when zip contents validation fails" => sub { @@ -31,7 +50,7 @@ describe "HTFeed::Collate" => sub { }; it "doesn't move to staging area" => sub { - $collate->run($storage); + eval { $collate->run($storage) }; ok(!$storage->called('stage')); }; @@ -43,14 +62,14 @@ describe "HTFeed::Collate" => sub { }; it "doesn't move to object storage" => sub { - $collate->run($storage); + eval { $collate->run($storage) }; ok(!$storage->called('make_object_path')); ok(!$storage->called('move')); }; it "cleans up the staging area" => sub { - $collate->run($storage); + eval { $collate->run($storage) }; ok($storage->called('clean_staging')); }; }; @@ -61,14 +80,22 @@ describe "HTFeed::Collate" => sub { }; it "calls rollback" => sub { - $collate->run($storage); + eval { $collate->run($storage) }; ok($storage->called('rollback')); }; it "cleans up the staging area" => sub { - $collate->run($storage); + eval { $collate->run($storage) }; ok($storage->called('clean_staging')); }; + + it "does not record to feed_audit" => sub { + eval { $collate->run($storage) }; + + my $r = get_dbh()->selectall_arrayref("SELECT first_ingest_date from feed_audit WHERE namespace = 'test' and id = 'test'"); + ok(scalar(@$r) == 0); + + } }; context "when postvalidation fails" => sub { @@ -77,23 +104,35 @@ describe "HTFeed::Collate" => sub { }; it "rolls back to the existing version" => sub { - $collate->run($storage); + eval { $collate->run($storage) }; ok($storage->called('rollback')); }; it "does not record an audit" => sub { - $collate->run($storage); + eval { $collate->run($storage) }; ok(!$storage->called('record_audit')); }; it "cleans up the staging area" => sub { - $collate->run($storage); + eval { $collate->run($storage) }; ok($storage->called('clean_staging')); }; }; + it "rolls back first storage if second storage fails" => sub { + my $storage1 = mocked_storage(); + $storage1->{name} = "storage1"; + my $storage2 = mocked_storage(); + $storage2->{name} = "storage2"; + $storage2->set_false('postvalidate'); + + eval { $collate->run($storage1,$storage2) }; + ok($storage1->called('rollback')); + ok($storage2->called('rollback')); + }; + context "when everything succeeds" => sub { it "encrypts the item" => sub { $collate->run($storage); @@ -128,7 +167,15 @@ describe "HTFeed::Collate" => sub { it "does not roll back" => sub { $collate->run($storage); ok(!$storage->called('rollback')); - } + }; + + it "records ingest date in feed_audit" => sub { + $collate->run($storage); + + my $r = get_dbh()->selectall_arrayref("SELECT first_ingest_date from feed_audit WHERE namespace = 'test' and id = 'test'"); + ok($r->[0][0]); + + }; }; @@ -138,7 +185,7 @@ describe "HTFeed::Collate" => sub { }; it "doesn't move to staging" => sub { - $collate->run($storage); + eval { $collate->run($storage); }; ok(!$storage->called('stage')); }; }; @@ -149,25 +196,26 @@ describe "HTFeed::Collate" => sub { }; it "doesn't move to staging" => sub { - $collate->run($storage); + eval { $collate->run($storage); }; ok(!$storage->called('stage')); }; }; }; context "with real volumes" => sub { - spec_helper 'storage_helper.pl'; - - local our ($tmpdirs, $testlog); - it "logs a repeat when collated twice" => sub { my $volume = stage_volume($tmpdirs,'test','test'); + my $storage = HTFeed::Storage::LocalPairtree->new( + volume => $volume, + config => { obj_dir => get_config('repository','obj_dir') }, + name => "localpairtree_test" + ); my $stage = HTFeed::Stage::Collate->new(volume => $volume); - $stage->run; + $stage->run($storage); # collate same thing again $stage = HTFeed::Stage::Collate->new(volume => $volume); - $stage->run; + $stage->run($storage); ok($testlog->matches(qw(INFO.*already in repo))); }; @@ -177,6 +225,7 @@ describe "HTFeed::Collate" => sub { local our ($bucket, $s3); my $old_storage_classes; + my $old_repository_dirs; my %s3s; before all => sub { @@ -198,14 +247,9 @@ describe "HTFeed::Collate" => sub { before each => sub { $old_storage_classes = get_config('storage_classes'); + $old_repository_dirs = get_config("repository"); + my $new_storage_classes = { - # simulating isilon - 'linkedpairtree-test' => - { - class => 'HTFeed::Storage::LinkedPairtree', - obj_dir => $tmpdirs->{obj_dir}, - link_dir => $tmpdirs->{link_dir} - }, # simulating truenas (site 1) 'pairtreeobjectstore-ptobj1' => { class => 'HTFeed::Storage::PairtreeObjectStore', @@ -234,11 +278,20 @@ describe "HTFeed::Collate" => sub { encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key" } }; + + my $vgw_home = "$ENV{FEED_HOME}/var/vgw"; + my $bucket_dir = "$vgw_home/$s3s{ptobj1}->{bucket}"; + set_config($new_storage_classes,'storage_classes'); + set_config({ + obj_dir => $bucket_dir, + link_dir => $bucket_dir + }, "repository"); }; after each => sub { set_config($old_storage_classes,'storage_classes'); + set_config($old_repository_dirs,'repository'); }; it "copies and records to all configured storages" => sub { @@ -258,8 +311,6 @@ describe "HTFeed::Collate" => sub { my $timestamp = $versioned_backup->[0][0]; my $pt_path = "test/pairtree_root/te/st/test"; - ok(-e "$tmpdirs->{obj_dir}/$pt_path/test.mets.xml",'copies mets to local storage'); - ok(-e "$tmpdirs->{obj_dir}/$pt_path/test.zip",'copies zip to local storage'); ok(-e "$tmpdirs->{backup_obj_dir}/test/tes/test.$timestamp.zip.gpg","copies the encrypted zip to backup storage"); ok(-e "$tmpdirs->{backup_obj_dir}/test/tes/test.$timestamp.mets.xml","copies the mets backup storage"); diff --git a/t/lib/HTFeed/Test/TempDirs.pm b/t/lib/HTFeed/Test/TempDirs.pm index 519d0765..cb8c983a 100644 --- a/t/lib/HTFeed/Test/TempDirs.pm +++ b/t/lib/HTFeed/Test/TempDirs.pm @@ -36,7 +36,7 @@ sub staging_dirtypes { sub repo_dirtypes { my $self = shift; - return qw(link_dir obj_dir other_obj_dir backup_obj_dir); + return qw(obj_dir other_obj_dir backup_obj_dir); } sub cleanup { @@ -57,6 +57,9 @@ sub setup_example { $self->{$dirtype} = $self->dir_for($dirtype); set_config($self->{$dirtype},'repository',$dirtype); } + + # We no longer create symlinks, so obj_dir and link_dir should be the same. + set_config($self->{obj_dir},'repository','link_dir'); } sub dir_for { diff --git a/t/storage_local_pairtree.t b/t/storage_local_pairtree.t index e2174c8a..389257e0 100644 --- a/t/storage_local_pairtree.t +++ b/t/storage_local_pairtree.t @@ -244,23 +244,6 @@ describe "HTFeed::Storage::LocalPairtree" => sub { }; - describe "#record_audit" => sub { - it "records an md5 check in the feed_audit table" => sub { - my $dbh = get_dbh(); - - my $storage = local_storage('test','test'); - $storage->stage; - $storage->make_object_path; - $storage->move; - $storage->record_audit; - - my $r = $dbh->selectall_arrayref("SELECT lastmd5check from feed_audit WHERE namespace = 'test' and id = 'test'"); - - ok($r->[0][0]); - - }; - }; - describe "#stage" => sub { context "when the item is not in repository" => sub { it "stages to the configured staging location" => sub { diff --git a/t/storage_pairtree_object_store.t b/t/storage_pairtree_object_store.t index da344c28..b3d76812 100644 --- a/t/storage_pairtree_object_store.t +++ b/t/storage_pairtree_object_store.t @@ -106,6 +106,8 @@ describe "HTFeed::Storage::PairtreeObjectStore" => sub { ok(-l "$bucket_dir/$pt_prefix/test"); }; + it "can roll back"; + }; runtests unless caller; From 01a1a9505b8031710e66bdd3290af6cb2c11b757 Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Wed, 3 Jun 2026 16:35:29 -0400 Subject: [PATCH 2/4] clean up repository storage config This addresses two issues: * We are no longer using symlinks to deposit material into the repository. * When we read from the repo, we just care about the root of the repo (just like e.g. babel apps reading from the repo), not about any symlinks, etc. Specific changes: * remove LinkedPairtree * remove "repository" key in config & references to link_dir / obj_dir; replace with a "repository_root" key * TempDirs keeps track of what it creates; callers can create additional temp dirs that will get cleaned up at the end of a test. --- etc/config/storage.yml.sample | 2 - etc/config_audio.yml | 8 -- etc/config_ingest_docker.yml | 13 +-- etc/config_prevalidate.yml | 4 +- etc/config_test.yml | 9 +- lib/HTFeed/PackageType/HathiTrust/Volume.pm | 4 +- lib/HTFeed/Stage/Collate.pm | 6 +- lib/HTFeed/Storage.pm | 2 +- lib/HTFeed/Storage/LinkedPairtree.pm | 112 -------------------- lib/HTFeed/Storage/PrefixedVersions.pm | 3 - lib/HTFeed/Storage/RemotePairtree.pm | 13 --- lib/HTFeed/Volume.pm | 31 +++--- t/backup_expiration.t | 4 +- t/collate.t | 39 +++---- t/lib/HTFeed/Test/TempDirs.pm | 30 +++--- t/storage_audit.t | 4 +- t/storage_linked_pairtree.t | 91 ---------------- t/storage_migrate.t | 13 ++- t/storage_pairtree_object_store.t | 2 - t/truenas_audit.t | 16 +-- 20 files changed, 76 insertions(+), 330 deletions(-) delete mode 100644 etc/config_audio.yml delete mode 100644 lib/HTFeed/Storage/LinkedPairtree.pm delete mode 100644 lib/HTFeed/Storage/RemotePairtree.pm delete mode 100644 t/storage_linked_pairtree.t diff --git a/etc/config/storage.yml.sample b/etc/config/storage.yml.sample index 7602ac8b..521b8e17 100644 --- a/etc/config/storage.yml.sample +++ b/etc/config/storage.yml.sample @@ -3,8 +3,6 @@ storage_classes: localpairtree-ingest: class: HTFeed::Storage::LocalPairtree - # The directory in which symbolic links will be created to each volume - link_dir: /sdr2/obj # The directory into which volumes will be loaded obj_dir: /sdr1/obj prefixedversions-ingest: diff --git a/etc/config_audio.yml b/etc/config_audio.yml deleted file mode 100644 index 52950b9d..00000000 --- a/etc/config_audio.yml +++ /dev/null @@ -1,8 +0,0 @@ -repository: - # The directory in which symbolic links will be created to each volume - link_dir: /nonexistent - # The directory into which volumes will be loaded - obj_dir: /nonexistent - -sip_root: /tmp -stop_on_error: 0 diff --git a/etc/config_ingest_docker.yml b/etc/config_ingest_docker.yml index 54e32697..4d8e9c67 100644 --- a/etc/config_ingest_docker.yml +++ b/etc/config_ingest_docker.yml @@ -4,20 +4,15 @@ realdb: ht storage_classes: localpairtree-ingest: - class: HTFeed::Storage::LinkedPairtree - # The directory in which symbolic links will be created to each volume - link_dir: /sdr1/obj + class: HTFeed::Storage::LocalPairtree # The directory into which volumes will be loaded - obj_dir: /sdr2/obj + obj_dir: /sdr1/obj prefixedversions-ingest: class: HTFeed::Storage::PrefixedVersions obj_dir: /htdataden -repository: - # The directory in which symbolic links will be created to each volume - link_dir: /sdr1/obj - # The directory into which volumes will be loaded - obj_dir: /sdr2/obj +# The directory where previously-ingested items can be read from. +repository_root: /sdr1/obj handle: repo_url_base: https://babel.hathitrust.org/cgi/pt?id= diff --git a/etc/config_prevalidate.yml b/etc/config_prevalidate.yml index 80bfb3d1..3ed1abc1 100644 --- a/etc/config_prevalidate.yml +++ b/etc/config_prevalidate.yml @@ -3,8 +3,6 @@ sip_root: /tmp/stage storage_classes: -repository: - link_dir: /tmp/nonexistent - obj_dir: /tmp/nonexistent +repository_root: /tmp/nonexistent stop_on_error: 0 diff --git a/etc/config_test.yml b/etc/config_test.yml index 9704859e..7b703952 100644 --- a/etc/config_test.yml +++ b/etc/config_test.yml @@ -6,13 +6,8 @@ test_staging: test_fixtures: /usr/local/feed/etc/feed_fixtures.sql -repository: - # Directory through which volumes in the repository should be accessed - link_dir: /tmp/obj - # Paths for deposit -- deprecated -- configure deposit locations through - # storage classes instead - obj_dir: /tmp/obj - backup_obj_dir: /tmp/obj_backup +# Directory through which volumes in the repository should be accessed +repository_root: /tmp/obj storage_classes: pairtree-test: diff --git a/lib/HTFeed/PackageType/HathiTrust/Volume.pm b/lib/HTFeed/PackageType/HathiTrust/Volume.pm index 79e8b877..3e858ef2 100644 --- a/lib/HTFeed/PackageType/HathiTrust/Volume.pm +++ b/lib/HTFeed/PackageType/HathiTrust/Volume.pm @@ -183,14 +183,14 @@ sub clean_download { # Returns path to item in the repository rather than in the staging area sub get_mets_path { my $self = shift; - my $path = shift || $self->get_repository_symlink; + my $path = shift || $self->get_repository_path; return $self->SUPER::get_mets_path($path); } sub get_zip_path { my $self = shift; - my $path = shift || $self->get_repository_symlink(); + my $path = shift || $self->get_repository_path(); return $self->SUPER::get_zip_path($path); } diff --git a/lib/HTFeed/Stage/Collate.pm b/lib/HTFeed/Stage/Collate.pm index 9b76e278..678585f8 100644 --- a/lib/HTFeed/Stage/Collate.pm +++ b/lib/HTFeed/Stage/Collate.pm @@ -7,7 +7,6 @@ use base qw(HTFeed::Stage); use Carp qw(croak); use HTFeed::Config qw(get_config); -use HTFeed::Storage::LinkedPairtree; use HTFeed::Storage::LocalPairtree; use HTFeed::Storage::PairtreeObjectStore; use HTFeed::Storage::ObjectStore; @@ -61,9 +60,6 @@ sub run{ $self->set_error("OperationFailed",operation => "collate", detail => "collate to $storage->{name} failed; rolled back"); } - # TODO wait until *all* storages have succeeded to clean up *any* - # storages; roll back any that have failed - $self->log_repeat($storage); } $self->record_audit() if !$self->{failed}; @@ -168,7 +164,7 @@ sub record_audit { my $volume = $self->{volume}; - my $repo_path = $volume->get_repository_symlink(); + my $repo_path = $volume->get_repository_path(); my $zip_path = $volume->get_repository_zip_path; die("Zip missing (in $repo_path) after collate") unless $zip_path and -e $zip_path; diff --git a/lib/HTFeed/Storage.pm b/lib/HTFeed/Storage.pm index 047b665b..d4af927a 100644 --- a/lib/HTFeed/Storage.pm +++ b/lib/HTFeed/Storage.pm @@ -92,7 +92,7 @@ sub zip_source { sub encrypted_zip_staging { my $self = shift; - return $self->{volume}->get_zip_path(get_config('staging', 'zipfile')) . '.gpg'; + return $self->{volume}->get_zip_path(get_config('staging', 'zipfile')) . "-$self->{name}.gpg"; } sub encrypt { diff --git a/lib/HTFeed/Storage/LinkedPairtree.pm b/lib/HTFeed/Storage/LinkedPairtree.pm deleted file mode 100644 index 180317f8..00000000 --- a/lib/HTFeed/Storage/LinkedPairtree.pm +++ /dev/null @@ -1,112 +0,0 @@ -package HTFeed::Storage::LinkedPairtree; - -use HTFeed::Storage; -use base qw(HTFeed::Storage::LocalPairtree); - -use strict; -use Log::Log4perl qw(get_logger); -use File::Pairtree qw(id2ppath s2ppchars); -use File::Path qw(make_path); -use HTFeed::VolumeValidator; -use URI::Escape; -use POSIX qw(strftime); -use HTFeed::DBTools qw(get_dbh); - -sub stage_path { - my $self = shift; - - if(-l $self->link_path) { - $self->existing_object_tmpdir; - } else { - $self->SUPER::stage_path('obj_dir'); - } -} - -sub existing_object_tmpdir { - my $self = shift; - - my $objdir = $self->follow_existing_link; - - if($objdir =~ qr(^(.*)/$self->{namespace}/pairtree_root/.*)) { - get_logger()->trace("Using existing object dir $objdir; staging to $1/.tmp"); - return $self->stage_path_from_base($1); - } else { - die("Can't determine storage root from existing storage $objdir"); - } - -} - -sub make_object_path { - my $self = shift; - - $self->symlink_if_needed; - - return 1; -} - -sub follow_existing_link { - my $self = shift; - - my $object_path; - my $link_path = $self->link_path; - # set object directory to target of existing link - unless ($object_path = readlink($link_path)){ - # there is no good reason we chould have a dir and no link - $self->set_error('OperationFailed', operation => 'readlink', file => $link_path, detail => "readlink failed: $!") - } - - return $object_path; -} - -sub make_link { - my $self = shift; - my $object_path = shift; - my $link_path = shift; - - get_logger->trace("Symlinking $object_path to $link_path"); - symlink ($object_path, $link_path) - or $self->set_error('OperationFailed', operation => 'symlink', detail => "Could not symlink $object_path to $link_path $!"); -} - -sub link_parent { - my $self = shift; - return sprintf('%s/%s/%s',$self->{config}{link_dir},$self->{namespace},id2ppath($self->{objid})); -} - -sub link_path { - my $self = shift; - my $pt_objid = s2ppchars($self->{objid}); - - return $self->link_parent . $pt_objid; -}; - -sub symlink_if_needed { - my $self = shift; - - my $volume = $self->{volume}; - my $namespace = $self->{namespace}; - my $objid = $self->{objid}; - my $pt_objid = s2ppchars($objid); - - my $link_parent = $self->link_parent; - my $link_path = $self->link_path; - - if (-l $link_path){ - $self->{object_path} = $self->follow_existing_link; - } - else{ - my $object_path = $self->object_path; - $self->safe_make_path($object_path); - $self->safe_make_path($link_parent); - $self->make_link($object_path,$link_path); - } - - return 1; -} - -sub record_audit { - get_logger()->warn("LinkedPairtree is deprecated; not recording audit"); - return 1; -} - -1; diff --git a/lib/HTFeed/Storage/PrefixedVersions.pm b/lib/HTFeed/Storage/PrefixedVersions.pm index 220bc4d4..fdf9d78f 100644 --- a/lib/HTFeed/Storage/PrefixedVersions.pm +++ b/lib/HTFeed/Storage/PrefixedVersions.pm @@ -1,8 +1,5 @@ # Changes from local pairtree # -# No need to link (although we could do this by setting obj_dir and link_dir to -# be the same) -# # Path is computed using a prefix of the barcode rather than the complete barcode # # Filenames include the date stamp diff --git a/lib/HTFeed/Storage/RemotePairtree.pm b/lib/HTFeed/Storage/RemotePairtree.pm deleted file mode 100644 index 5f00a745..00000000 --- a/lib/HTFeed/Storage/RemotePairtree.pm +++ /dev/null @@ -1,13 +0,0 @@ -# Changes from local pairtree: -# -# safe_make_path needs to be able to do it remotely -# stage_path changes to include remote host -# checks in stage need to check directory existence on remote host -# validate needs to parse remote METS -# validate needs to run checksum remotely -# move needs to run remotely -# object path is probably OK -# symlink_if_needed/check_existing_link needs to check existence remotely & run remotely -# -# rsync instead of cp in stage - diff --git a/lib/HTFeed/Volume.pm b/lib/HTFeed/Volume.pm index e5db4a88..cf09d90c 100644 --- a/lib/HTFeed/Volume.pm +++ b/lib/HTFeed/Volume.pm @@ -559,38 +559,38 @@ sub get_marc_xml { } -sub get_repository_symlink { +sub get_repository_path { my $self = shift; # Memoize $self->{repository_symlink} - if (not defined $self->{repository_symlink}) { - my $link_dir = get_config("repository","link_dir"); + if (not defined $self->{repository_path}) { + my $repo_home = get_config("repository_root"); my $namespace = $self->get_namespace(); my $objid = $self->get_objid(); my $pairtree_path = id2ppath($objid); my $pt_objid = $self->get_pt_objid(); - $self->{repository_symlink} = join( + $self->{repository_path} = join( '/', - $link_dir, + $repo_home, $namespace, $pairtree_path, $pt_objid ); } - return $self->{repository_symlink}; + return $self->{repository_path}; } sub get_repository_mets_path { my $self = shift; - my $repos_symlink = $self->get_repository_symlink(); + my $repos_path = $self->get_repository_path(); - return unless (-l $repos_symlink or -d $repos_symlink); + return unless (-l $repos_path or -d $repos_path); my $mets_in_repository_file = sprintf( "%s/%s.mets.xml", - $repos_symlink, + $repos_path, $self->get_pt_objid() ); @@ -600,13 +600,13 @@ sub get_repository_mets_path { sub get_repository_zip_path { my $self = shift; - my $repos_symlink = $self->get_repository_symlink(); + my $repos_path = $self->get_repository_path(); - return unless (-l $repos_symlink or -d $repos_symlink); + return unless (-l $repos_path or -d $repos_path); my $zip_in_repository_file = sprintf( "%s/%s.zip", - $repos_symlink, + $repos_path, $self->get_pt_objid() ); @@ -1022,7 +1022,7 @@ sub clean_download { sub ingested{ my $self = shift; - my $link = $self->get_repository_symlink(); + my $link = $self->get_repository_path(); return 1 if (-e $link); return; @@ -1227,10 +1227,9 @@ Get all files that should be valid UTF-8 Returns an XML::LibXML node with the MARCXML -=item get_repository_symlink() +=item get_repository_path() -Returns the path to the repository symlink for the object. -(or the directory if the repository does not use symlinks) +Returns the path to the directory or symlink in the repository for the object. =item get_repository_mets_path() diff --git a/t/backup_expiration.t b/t/backup_expiration.t index 709d9891..52bd48b5 100644 --- a/t/backup_expiration.t +++ b/t/backup_expiration.t @@ -17,11 +17,13 @@ describe "HTFeed::BackupExpiration" => sub { before each => sub { $old_storage_classes = get_config('storage_classes'); + my $backup_dir = $tmpdirs->dir_for("backup"); + my $new_storage_classes = { 'prefixedversions-test' => { class => 'HTFeed::Storage::PrefixedVersions', - obj_dir => $tmpdirs->{backup_obj_dir}, + obj_dir => $backup_dir, encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key" }, 'objectstore-test' => diff --git a/t/collate.t b/t/collate.t index 5f7365a9..7873f792 100644 --- a/t/collate.t +++ b/t/collate.t @@ -21,6 +21,7 @@ describe "HTFeed::Collate" => sub { my $storage = Test::MockObject->new(); $storage->set_true(qw(stage validate_zip_completeness prevalidate make_object_path move postvalidate record_audit cleanup rollback clean_staging encrypt verify_crypt)); $storage->{name} = "mock_storage"; + $storage->{errors} = []; return $storage; } @@ -33,7 +34,6 @@ describe "HTFeed::Collate" => sub { packagetype => 'simple'); $collate = HTFeed::Stage::Collate->new(volume => $volume); - # TODO remove repo config for obj dir vs. link dir # need to have something here so record_audit will be happy my $obj_path = $tmpdirs->{obj_dir} . "/test/pairtree_root/te/st/test"; make_path($obj_path); @@ -103,7 +103,7 @@ describe "HTFeed::Collate" => sub { $storage->set_false('postvalidate'); }; - it "rolls back to the existing version" => sub { + it "calls rollback (may be a noop)" => sub { eval { $collate->run($storage) }; ok($storage->called('rollback')); @@ -121,18 +121,6 @@ describe "HTFeed::Collate" => sub { }; }; - it "rolls back first storage if second storage fails" => sub { - my $storage1 = mocked_storage(); - $storage1->{name} = "storage1"; - my $storage2 = mocked_storage(); - $storage2->{name} = "storage2"; - $storage2->set_false('postvalidate'); - - eval { $collate->run($storage1,$storage2) }; - ok($storage1->called('rollback')); - ok($storage2->called('rollback')); - }; - context "when everything succeeds" => sub { it "encrypts the item" => sub { $collate->run($storage); @@ -207,7 +195,7 @@ describe "HTFeed::Collate" => sub { my $volume = stage_volume($tmpdirs,'test','test'); my $storage = HTFeed::Storage::LocalPairtree->new( volume => $volume, - config => { obj_dir => get_config('repository','obj_dir') }, + config => { obj_dir => get_config('repository_root') }, name => "localpairtree_test" ); my $stage = HTFeed::Stage::Collate->new(volume => $volume); @@ -225,7 +213,7 @@ describe "HTFeed::Collate" => sub { local our ($bucket, $s3); my $old_storage_classes; - my $old_repository_dirs; + my $old_repository_root; my %s3s; before all => sub { @@ -247,7 +235,9 @@ describe "HTFeed::Collate" => sub { before each => sub { $old_storage_classes = get_config('storage_classes'); - $old_repository_dirs = get_config("repository"); + $old_repository_root = get_config('repository_root'); + + my $backup_dir = $tmpdirs->dir_for("backup"); my $new_storage_classes = { # simulating truenas (site 1) @@ -266,7 +256,7 @@ describe "HTFeed::Collate" => sub { 'prefixedversions-test' => { class => 'HTFeed::Storage::PrefixedVersions', - obj_dir => $tmpdirs->{backup_obj_dir}, + obj_dir => $backup_dir, encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key" }, # simulating glacier deep archive @@ -283,15 +273,12 @@ describe "HTFeed::Collate" => sub { my $bucket_dir = "$vgw_home/$s3s{ptobj1}->{bucket}"; set_config($new_storage_classes,'storage_classes'); - set_config({ - obj_dir => $bucket_dir, - link_dir => $bucket_dir - }, "repository"); + set_config($bucket_dir, 'repository_root'); }; after each => sub { set_config($old_storage_classes,'storage_classes'); - set_config($old_repository_dirs,'repository'); + set_config($old_repository_root,'repository_root'); }; it "copies and records to all configured storages" => sub { @@ -301,7 +288,7 @@ describe "HTFeed::Collate" => sub { my $dbh = get_dbh(); my $audits = $dbh->selectall_arrayref("SELECT * from feed_audit WHERE namespace = 'test' and id = 'test'"); - my $versioned_backup = $dbh->selectall_arrayref("SELECT version from feed_backups WHERE namespace = 'test' and id = 'test' and path like ?",undef,$tmpdirs->{backup_obj_dir} . '%'); + my $versioned_backup = $dbh->selectall_arrayref("SELECT version from feed_backups WHERE namespace = 'test' and id = 'test' and path like ?",undef,$tmpdirs->{backup} . '%'); my $s3_backup = $dbh->selectall_arrayref("SELECT version from feed_backups WHERE namespace = 'test' and id = 'test' and path like ?",undef,"s3://$bucket%"); is(scalar(@{$audits}),1,'records an audit'); @@ -312,8 +299,8 @@ describe "HTFeed::Collate" => sub { my $pt_path = "test/pairtree_root/te/st/test"; - ok(-e "$tmpdirs->{backup_obj_dir}/test/tes/test.$timestamp.zip.gpg","copies the encrypted zip to backup storage"); - ok(-e "$tmpdirs->{backup_obj_dir}/test/tes/test.$timestamp.mets.xml","copies the mets backup storage"); + ok(-e "$tmpdirs->{backup}/test/tes/test.$timestamp.zip.gpg","copies the encrypted zip to backup storage"); + ok(-e "$tmpdirs->{backup}/test/tes/test.$timestamp.mets.xml","copies the mets backup storage"); my $s3_timestamp = $s3_backup->[0][0]; diff --git a/t/lib/HTFeed/Test/TempDirs.pm b/t/lib/HTFeed/Test/TempDirs.pm index cb8c983a..4a5fc7e3 100644 --- a/t/lib/HTFeed/Test/TempDirs.pm +++ b/t/lib/HTFeed/Test/TempDirs.pm @@ -18,27 +18,26 @@ sub new { $self->{test_home} = abs_path($FindBin::Bin); $self->{tmpdir} = tempdir("feed-test-XXXXXX",TMPDIR => 1); + $self->{dirtypes} = []; return bless ($self, $class); } sub test_home { my $self = shift; - return $self->{test_home}; } +sub storage_dirtypes { + return qw(obj_dir backup); +} + sub staging_dirtypes { my $self = shift; return qw(ingest preingest zipfile zip download ingested punted fetch); } -sub repo_dirtypes { - my $self = shift; - return qw(obj_dir other_obj_dir backup_obj_dir); -} - sub cleanup { my $self = shift; @@ -53,13 +52,11 @@ sub setup_example { set_config($self->{$dirtype},'staging',$dirtype); } - foreach my $dirtype ($self->repo_dirtypes) { + foreach my $dirtype ($self->storage_dirtypes) { $self->{$dirtype} = $self->dir_for($dirtype); - set_config($self->{$dirtype},'repository',$dirtype); } - # We no longer create symlinks, so obj_dir and link_dir should be the same. - set_config($self->{obj_dir},'repository','link_dir'); + set_config($self->{obj_dir},"repository_root"); } sub dir_for { @@ -67,17 +64,24 @@ sub dir_for { my $dirtype = shift; my $tmpdir = $self->{tmpdir}; + my $subdir = tempdir("$tmpdir/feed-test-$dirtype-XXXXXX"); - return tempdir("$tmpdir/feed-test-$dirtype-XXXXXX"); + $self->{$dirtype} = $subdir; + push(@{$self->{dirtypes}}, $dirtype); + + return $subdir; } sub cleanup_example { my $self = shift; - foreach my $dirtype ($self->staging_dirtypes, $self->repo_dirtypes) { - remove_tree $self->{$dirtype}; + foreach my $dirtype (@{$self->{dirtypes}}) { + my $dir = $self->{$dirtype}; + remove_tree $dir if defined $dir and -d $dir; } + $self->{dirtypes} = []; + } 1; diff --git a/t/storage_audit.t b/t/storage_audit.t index cf270590..0ed1ac5e 100644 --- a/t/storage_audit.t +++ b/t/storage_audit.t @@ -16,11 +16,13 @@ describe "HTFeed::StorageAudit" => sub { before each => sub { $old_storage_classes = get_config('storage_classes'); + my $backup_dir = $tmpdirs->dir_for("backup"); + my $new_storage_classes = { 'prefixedversions-test' => { class => 'HTFeed::Storage::PrefixedVersions', - obj_dir => $tmpdirs->{backup_obj_dir} . '/obj', + obj_dir => $backup_dir . '/obj', encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key" }, 'objectstore-test' => diff --git a/t/storage_linked_pairtree.t b/t/storage_linked_pairtree.t deleted file mode 100644 index 300050b6..00000000 --- a/t/storage_linked_pairtree.t +++ /dev/null @@ -1,91 +0,0 @@ -use Test::Spec; -use HTFeed::Storage::LinkedPairtree; - -use strict; - -describe "HTFeed::Storage::LinkedPairtree" => sub { - spec_helper 'storage_helper.pl'; - local our ($tmpdirs, $testlog); - - sub make_old_version_other_dir { - my $volume = stage_volume($tmpdirs,@_); - - my $storage = HTFeed::Storage::LinkedPairtree->new( - name => 'linkedpairtree-test', - volume => $volume, - config => { - obj_dir => $tmpdirs->{other_obj_dir}, - link_dir => $tmpdirs->{link_dir} - } - ); - - make_old_version($storage); - } - - sub linked_storage { - my $volume = stage_volume($tmpdirs,@_); - - my $storage = HTFeed::Storage::LinkedPairtree->new( - name => 'linkedpairtree-test', - volume => $volume, - config => { - obj_dir => $tmpdirs->{obj_dir}, - link_dir => $tmpdirs->{link_dir} - } - ); - - return $storage; - } - - it "moves the existing version aside when the link target doesn't match current objdir" => sub { - make_old_version_other_dir('test','test'); - my $storage = linked_storage( 'test', 'test'); - $storage->stage; - $storage->make_object_path; - $storage->move; - - ok(-e "$tmpdirs->{other_obj_dir}/test/pairtree_root/te/st/test/test.mets.xml.old"); - ok(-e "$tmpdirs->{other_obj_dir}/test/pairtree_root/te/st/test/test.zip.old"); - - }; - - describe "#make_object_path" => sub { - - context "when the object is not in the repo" => sub { - it "creates a symlink for the volume" => sub { - my $storage = linked_storage('test','test'); - $storage->make_object_path; - - is("$tmpdirs->{obj_dir}/test/pairtree_root/te/st/test", - readlink("$tmpdirs->{link_dir}/test/pairtree_root/te/st/test")); - }; - }; - - - context "when the object is in the repo but link target doesn't match current obj dir" => sub { - it "uses existing target of the link" => sub { - make_old_version_other_dir('test','test'); - - my $storage = linked_storage('test','test'); - $storage->make_object_path; - - is($storage->object_path,"$tmpdirs->{other_obj_dir}/test/pairtree_root/te/st/test"); - }; - }; - }; - - describe "#stage" => sub { - context "when the item is in the repository with a different storage path" => sub { - it "deposits to a staging area under that path" => sub { - make_old_version_other_dir('test','test'); - my $storage = linked_storage( 'test', 'test'); - $storage->stage; - - ok(-e "$tmpdirs->{other_obj_dir}/.tmp/test.test/test.mets.xml"); - ok(-e "$tmpdirs->{other_obj_dir}/.tmp/test.test/test.zip"); - }; - }; - }; -}; - -runtests unless caller; diff --git a/t/storage_migrate.t b/t/storage_migrate.t index 92a25a3f..909572b0 100644 --- a/t/storage_migrate.t +++ b/t/storage_migrate.t @@ -23,11 +23,13 @@ describe "HTFeed::Stage::StorageMigrate" => sub { before each => sub { $old_storage_classes = get_config('storage_migrate'); + my $backup_dir = $tmpdirs->dir_for("backup"); + my $new_storage_classes = { 'prefixedversions-test' => { class => 'HTFeed::Storage::PrefixedVersions', - obj_dir => $tmpdirs->{backup_obj_dir}, + obj_dir => $backup_dir, encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key" }, 'objectstore-test' => @@ -54,14 +56,11 @@ describe "HTFeed::Stage::StorageMigrate" => sub { my $pt_path = id2ppath($objid); my $objdir = "$tmpdirs->{obj_dir}/test/$pt_path/$pt_objid"; - my $link_base = "$tmpdirs->{link_dir}/test/$pt_path"; my $mets = $tmpdirs->test_home . "/fixtures/volumes/$pt_objid.mets.xml"; my $zip = $tmpdirs->test_home . "/fixtures/volumes/$pt_objid.zip"; system("mkdir -p $objdir"); - system("mkdir -p $link_base"); - system("ln -s $objdir $link_base"); system("cp $zip $objdir/$pt_objid.zip"); system("cp $mets $objdir/$pt_objid.mets.xml"); } @@ -83,15 +82,15 @@ describe "HTFeed::Stage::StorageMigrate" => sub { my $dbh = get_dbh(); my $audits = $dbh->selectall_arrayref("SELECT * from feed_audit WHERE namespace = '$namespace' and id = '$objid'"); - my $versioned_backup = $dbh->selectall_arrayref("SELECT version from feed_backups WHERE namespace = '$namespace' and id = '$objid' and path like ?",undef,$tmpdirs->{backup_obj_dir} . '%'); + my $versioned_backup = $dbh->selectall_arrayref("SELECT version from feed_backups WHERE namespace = '$namespace' and id = '$objid' and path like ?",undef,$tmpdirs->{backup} . '%'); my $s3_backup = $dbh->selectall_arrayref("SELECT version from feed_backups WHERE namespace = '$namespace' and id = '$objid' and path like ?",undef,"s3://$bucket%"); is(scalar(@{$versioned_backup}),1,'records a backup for versioned pairtree'); is(scalar(@{$s3_backup}),1,'records a backup for object store'); my $timestamp = $versioned_backup->[0][0]; - ok(-e "$tmpdirs->{backup_obj_dir}/$obj_path/$pt_objid.$timestamp.zip.gpg","copies the encrypted zip to backup storage"); - ok(-e "$tmpdirs->{backup_obj_dir}/$obj_path/$pt_objid.$timestamp.mets.xml","copies the mets backup storage"); + ok(-e "$tmpdirs->{backup}/$obj_path/$pt_objid.$timestamp.zip.gpg","copies the encrypted zip to backup storage"); + ok(-e "$tmpdirs->{backup}/$obj_path/$pt_objid.$timestamp.mets.xml","copies the mets backup storage"); my $s3_timestamp = $s3_backup->[0][0]; ok($s3->s3_has("$namespace.$pt_objid.$s3_timestamp.zip.gpg"),"copies the zip to s3"); diff --git a/t/storage_pairtree_object_store.t b/t/storage_pairtree_object_store.t index b3d76812..da344c28 100644 --- a/t/storage_pairtree_object_store.t +++ b/t/storage_pairtree_object_store.t @@ -106,8 +106,6 @@ describe "HTFeed::Storage::PairtreeObjectStore" => sub { ok(-l "$bucket_dir/$pt_prefix/test"); }; - it "can roll back"; - }; runtests unless caller; diff --git a/t/truenas_audit.t b/t/truenas_audit.t index 77843203..b7cba715 100644 --- a/t/truenas_audit.t +++ b/t/truenas_audit.t @@ -82,14 +82,14 @@ describe "bin/audit/main_repo_audit.pl" => sub { ); } - sub temp_link_path { + sub temp_repo_root { my $namespace = shift || 'test'; my $objid = shift || 'test'; return File::Spec->catfile( File::Spec->rootdir, 'tmp', - 'obj_link', + 'obj', $namespace, id2ppath($objid), s2ppchars($objid) @@ -106,19 +106,19 @@ describe "bin/audit/main_repo_audit.pl" => sub { my $sdr2_path = temp_sdr_path(2); my $sdr1_obj_path = temp_sdr_obj_path(1,$namespace,$objid); my $sdr2_obj_path = temp_sdr_obj_path(2,$namespace,$objid); - my $temp_link_path = temp_link_path($namespace,$objid); + my $temp_repo_root = temp_repo_root($namespace,$objid); File::Path::make_path("$sdr2_obj_path"); system("cp -r $tmpdirs->{obj_dir}/* $sdr2_path/obj/"); - # Symlink into obj_link so Volume.pm can find the files, + # Symlink into obj so Volume.pm can find the files, # and into sdr1 for symlink checks inside truenas_audit.pl # Create directory structures but remove the leaf node so we can recreate it as a symlink. # This is kind of silly but trying to create a partial path would be messier. - File::Path::make_path($temp_link_path); - File::Path::remove_tree($temp_link_path); + File::Path::make_path($temp_repo_root); + File::Path::remove_tree($temp_repo_root); File::Path::make_path($sdr1_obj_path); File::Path::remove_tree($sdr1_obj_path); - system("ln -sf $sdr2_obj_path $temp_link_path"); + system("ln -sf $sdr2_obj_path $temp_repo_root"); system("ln -sf $sdr2_obj_path $sdr1_obj_path"); } @@ -143,7 +143,7 @@ describe "bin/audit/main_repo_audit.pl" => sub { my $tmp_home = $tmpdirs->{tmpdir}; File::Path::remove_tree("$tmp_home/sdr/1"); File::Path::remove_tree("$tmp_home/sdr/2"); - File::Path::remove_tree('/tmp/obj_link'); + File::Path::remove_tree('/tmp/obj'); get_dbh->prepare('DELETE FROM feed_storage')->execute; get_dbh->prepare('DELETE FROM feed_audit_detail')->execute; }; From 0f37645efd7f02f91f04844fc395b73179b76c31 Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Thu, 4 Jun 2026 15:46:09 -0400 Subject: [PATCH 3/4] ETT-824: record how long collate takes --- lib/HTFeed/Stage/Collate.pm | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/HTFeed/Stage/Collate.pm b/lib/HTFeed/Stage/Collate.pm index 678585f8..226d0a31 100644 --- a/lib/HTFeed/Stage/Collate.pm +++ b/lib/HTFeed/Stage/Collate.pm @@ -14,6 +14,7 @@ use HTFeed::Storage::PrefixedVersions; use Log::Log4perl qw(get_logger); use POSIX qw(strftime); use HTFeed::DBTools qw(get_dbh); +use Time::HiRes; =head1 NAME @@ -41,9 +42,11 @@ sub run{ foreach my $storage (@storages) { my $rolled_back = 0; + my $start_time = Time::HiRes::time(); if ($self->collate($storage)) { - # TODO log how long it took - $self->log_info("finished collate to $storage->{name}, cleaning up"); + my $end_time = Time::HiRes::time(); + my $delta_time = $end_time - $start_time; + $self->log_info("finished collate to $storage->{name}, delta $delta_time, cleaning up"); $storage->cleanup } else { $self->log_warn("collate to $storage->{name} failed, rolling back"); From a7bd638a13b0844bed92140f22489404fa26c3ed Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Thu, 4 Jun 2026 16:35:24 -0400 Subject: [PATCH 4/4] DRY logging in stage --- lib/HTFeed/Stage.pm | 64 +++++++++++++++++++------------------ lib/HTFeed/Stage/Collate.pm | 5 ++- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/lib/HTFeed/Stage.pm b/lib/HTFeed/Stage.pm index 7be10e19..08da9ea8 100644 --- a/lib/HTFeed/Stage.pm +++ b/lib/HTFeed/Stage.pm @@ -13,6 +13,7 @@ use File::Find; use HTFeed::Config qw(get_config); use HTFeed::JobMetrics; use Log::Log4perl qw(get_logger); +use Log::Log4perl::Level; use POSIX qw(ceil); sub new { @@ -80,22 +81,26 @@ sub failed { return 1; } +sub log{ + my $self = shift; + my $level = shift; + my $message = shift; + + get_logger(ref($self))->log($level, + $message, + namespace => $self->{volume}->get_namespace(), + objid => $self->{volume}->get_objid(), + stage => ref($self), + @_ + ); + +} -# FIXME deduplicate sub set_error { my $self = shift; - my $error = shift; $self->{failed}++; - # log error w/ l4p - my $logger = get_logger( ref($self) ); - $logger->error( - $error, - namespace => $self->{volume}->get_namespace(), - objid => $self->{volume}->get_objid(), - stage => ref($self), - @_ - ); + $self->log($ERROR, @_); if ( get_config('stop_on_error') ) { croak("STAGE_ERROR"); @@ -104,32 +109,29 @@ sub set_error { sub log_warn { my $self = shift; - my $message = shift; - - my $logger = get_logger( ref($self) ); - $logger->warn( - $message, - namespace => $self->{volume}->get_namespace(), - objid => $self->{volume}->get_objid(), - stage => ref($self), - @_ - ); + + $self->log($WARN, @_); } sub log_info { my $self = shift; - my $message = shift; - - my $logger = get_logger( ref($self) ); - $logger->info( - $message, - namespace => $self->{volume}->get_namespace(), - objid => $self->{volume}->get_objid(), - stage => ref($self), - @_ - ); + + $self->log($INFO, @_); +} + +sub log_debug { + my $self = shift; + + $self->log($DEBUG, @_); } +sub log_trace { + my $self = shift; + + $self->log($TRACE, @_); +} + + sub clean { my $self = shift; my $success = $self->succeeded(); diff --git a/lib/HTFeed/Stage/Collate.pm b/lib/HTFeed/Stage/Collate.pm index 226d0a31..18b01ede 100644 --- a/lib/HTFeed/Stage/Collate.pm +++ b/lib/HTFeed/Stage/Collate.pm @@ -11,7 +11,6 @@ use HTFeed::Storage::LocalPairtree; use HTFeed::Storage::PairtreeObjectStore; use HTFeed::Storage::ObjectStore; use HTFeed::Storage::PrefixedVersions; -use Log::Log4perl qw(get_logger); use POSIX qw(strftime); use HTFeed::DBTools qw(get_dbh); use Time::HiRes; @@ -179,8 +178,8 @@ sub record_audit { my $metssize = -s $mets_path; my $metsdate = $self->file_date($mets_path); my $sth = get_dbh()->prepare($stmt); - get_logger()->trace("feed_audit: $zip_path / $zipdate / $zipsize bytes"); - get_logger()->trace("feed_audit: $mets_path / $metsdate / $metssize bytes"); + $self->log_trace("feed_audit: $zip_path / $zipdate / $zipsize bytes"); + $self->log_trace("feed_audit: $mets_path / $metsdate / $metssize bytes"); my $res = $sth->execute( $volume->{namespace}, $volume->{objid}, $zipsize, $zipdate, $metssize, $metsdate,