Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions etc/config/storage.yml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
storage_classes:
localpairtree-ingest:
class: HTFeed::Storage::LocalPairtree
# The directory in which symbolic links will be created to each volume
link_dir: /sdr2/obj
# The directory into which volumes will be loaded
obj_dir: /sdr1/obj
prefixedversions-ingest:
Expand Down
8 changes: 0 additions & 8 deletions etc/config_audio.yml

This file was deleted.

13 changes: 4 additions & 9 deletions etc/config_ingest_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,15 @@ realdb: ht

storage_classes:
localpairtree-ingest:
class: HTFeed::Storage::LinkedPairtree
# The directory in which symbolic links will be created to each volume
link_dir: /sdr1/obj
class: HTFeed::Storage::LocalPairtree
# The directory into which volumes will be loaded
obj_dir: /sdr2/obj
obj_dir: /sdr1/obj
prefixedversions-ingest:
class: HTFeed::Storage::PrefixedVersions
obj_dir: /htdataden

repository:
# The directory in which symbolic links will be created to each volume
link_dir: /sdr1/obj
# The directory into which volumes will be loaded
obj_dir: /sdr2/obj
# The directory where previously-ingested items can be read from.
repository_root: /sdr1/obj

handle:
repo_url_base: https://babel.hathitrust.org/cgi/pt?id=
Expand Down
4 changes: 1 addition & 3 deletions etc/config_prevalidate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ sip_root: /tmp/stage

storage_classes:

repository:
link_dir: /tmp/nonexistent
obj_dir: /tmp/nonexistent
repository_root: /tmp/nonexistent

stop_on_error: 0
14 changes: 4 additions & 10 deletions etc/config_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,12 @@ test_staging:

test_fixtures: /usr/local/feed/etc/feed_fixtures.sql

repository:
# The directory in which symbolic links will be created to each volume
link_dir: /tmp/obj_link
# The directory into which volumes will be loaded
obj_dir: /tmp/obj
backup_obj_dir: /tmp/obj_backup
# Directory through which volumes in the repository should be accessed
repository_root: /tmp/obj

storage_classes:
linkedpairtree-test:
class: HTFeed::Storage::LinkedPairtree
# The directory in which symbolic links will be created to each volume
link_dir: /tmp/obj_link
pairtree-test:
class: HTFeed::Storage::LocalPairtree
# The directory into which volumes will be loaded
obj_dir: /tmp/obj

Expand Down
4 changes: 3 additions & 1 deletion etc/ingest.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ CREATE TABLE IF NOT EXISTS `feed_audit` (
`id` varchar(30) NOT NULL,
`sdr_partition` tinyint(4) DEFAULT NULL,
`zip_size` bigint(20) DEFAULT NULL,
`first_ingest_date` datetime NULL DEFAULT CURRENT_TIMESTAMP,

@aelkiss aelkiss Jun 4, 2026

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the change for recording the first ingest date; the application side doesn't need to handle it directly at all, beyond making sure that something is recorded in feed_audit.

`image_size` bigint(20) DEFAULT NULL,
`zip_date` datetime DEFAULT NULL,
`mets_size` bigint(20) DEFAULT NULL,
Expand All @@ -15,7 +16,8 @@ CREATE TABLE IF NOT EXISTS `feed_audit` (
`md5check_ok` tinyint(1) DEFAULT NULL,
`is_tombstoned` tinyint(1) DEFAULT NULL,
PRIMARY KEY (`namespace`,`id`),
KEY `feed_audit_zip_date_idx` (`zip_date`)
KEY `feed_audit_zip_date_idx` (`zip_date`),
KEY `feed_audit_first_ingest_date_idx` (`first_ingest_date`)
);

CREATE TABLE IF NOT EXISTS `feed_queue_disallow` (
Expand Down
4 changes: 2 additions & 2 deletions lib/HTFeed/PackageType/HathiTrust/Volume.pm
Original file line number Diff line number Diff line change
Expand Up @@ -183,14 +183,14 @@ sub clean_download {
# Returns the path to this item's METS in the repository rather than in the
# staging area.
#
# An optional path may be passed as the first argument; when it is omitted
# (or false) the volume's repository path is used instead. The actual lookup
# is delegated to the parent class's get_mets_path.
sub get_mets_path {
    my $self = shift;

    # Declare $path exactly once: a second "my $path = shift || ..." would
    # consume a further (absent) argument and silently discard the path the
    # caller supplied.
    my $path = shift || $self->get_repository_path;

    return $self->SUPER::get_mets_path($path);
}

# Returns the path to this item's zip in the repository.
#
# An optional path may be passed as the first argument; when it is omitted
# (or false) the volume's repository path is used instead. The actual lookup
# is delegated to the parent class's get_zip_path.
sub get_zip_path {
    my $self = shift;

    # Declare $path exactly once so that a caller-supplied path is honoured
    # and only one repository lookup is performed.
    my $path = shift || $self->get_repository_path();

    return $self->SUPER::get_zip_path($path);
}
Expand Down
61 changes: 39 additions & 22 deletions lib/HTFeed/Stage.pm
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use File::Find;
use HTFeed::Config qw(get_config);
use HTFeed::JobMetrics;
use Log::Log4perl qw(get_logger);
use Log::Log4perl::Level;
use POSIX qw(ceil);

sub new {
Expand Down Expand Up @@ -80,41 +81,57 @@ sub failed {
return 1;
}

# Sends $message to the Log4perl logger named after this stage's class, at
# the given $level.
#
# The volume's namespace and objid and the stage class name are appended as
# key/value pairs after the message; any further arguments supplied by the
# caller are passed through after those.
sub log {
    my ( $self, $level, $message, @extra ) = @_;

    my $stage_class = ref($self);
    my $volume      = $self->{volume};

    get_logger($stage_class)->log(
        $level,
        $message,
        namespace => $volume->get_namespace(),
        objid     => $volume->get_objid(),
        stage     => $stage_class,
        @extra
    );
}

# Marks this stage as failed and logs the error.
#
# Arguments: the error (first argument), followed by any additional
# key/value pairs to attach to the log entry.
#
# Increments $self->{failed}, logs once at ERROR level through log(), and
# croaks with "STAGE_ERROR" when the 'stop_on_error' config value is true.
sub set_error {
    my $self  = shift;
    my $error = shift;
    $self->{failed}++;

    # $error has already been shifted off @_, so it must be passed to log()
    # explicitly; forwarding @_ alone would drop the error itself and leave
    # only its key/value details. Logging through log() only (rather than
    # also calling the logger directly) keeps each failure to one entry.
    $self->log( $ERROR, $error, @_ );

    if ( get_config('stop_on_error') ) {
        croak("STAGE_ERROR");
    }
}

sub set_info {
# Logs at WARN level; every argument is handed unchanged to log().
sub log_warn {
    my ( $self, @args ) = @_;

    return $self->log( $WARN, @args );
}

# Logs at INFO level; every argument is handed unchanged to log().
sub log_info {
    my ( $self, @args ) = @_;

    return $self->log( $INFO, @args );
}

# Logs at DEBUG level; every argument is handed unchanged to log().
#
# This mirrors log_warn / log_info / log_trace: the message stays in @_ so
# that log() receives it, and nothing is emitted at any other level.
sub log_debug {
    my $self = shift;

    $self->log( $DEBUG, @_ );
}

# Logs at TRACE level; every argument is handed unchanged to log().
sub log_trace {
    my ( $self, @args ) = @_;

    return $self->log( $TRACE, @args );
}


sub clean {
my $self = shift;
my $success = $self->succeeded();
Expand Down
77 changes: 71 additions & 6 deletions lib/HTFeed/Stage/Collate.pm
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ use base qw(HTFeed::Stage);

use Carp qw(croak);
use HTFeed::Config qw(get_config);
use HTFeed::Storage::LinkedPairtree;
use HTFeed::Storage::LocalPairtree;
use HTFeed::Storage::PairtreeObjectStore;
use HTFeed::Storage::ObjectStore;
use HTFeed::Storage::PrefixedVersions;
use Log::Log4perl qw(get_logger);
use POSIX qw(strftime);
use HTFeed::DBTools qw(get_dbh);
use Time::HiRes;

=head1 NAME

Expand All @@ -38,18 +39,32 @@ sub run{
@storages = $self->storages unless @storages;

foreach my $storage (@storages) {
my $rolled_back = 0;

my $start_time = Time::HiRes::time();
if ($self->collate($storage)) {
get_logger->trace("finished collate to $storage, cleaning up");
my $end_time = Time::HiRes::time();
my $delta_time = $end_time - $start_time;
$self->log_info("finished collate to $storage->{name}, delta $delta_time, cleaning up");
$storage->cleanup
} else {
get_logger->warn("collate to $storage failed, rolling back");
$self->log_warn("collate to $storage->{name} failed, rolling back");
$storage->rollback;
$rolled_back = 1;
}

$storage->clean_staging();
$self->check_errors($storage);

# If collate returned false but didn't raise an error, we still need to
# record that the stage failed
if($rolled_back) {
$self->set_error("OperationFailed",operation => "collate", detail => "collate to $storage->{name} failed; rolled back");
}

$self->log_repeat($storage);
}
$self->record_audit() if !$self->{failed};
$self->_set_done();
$self->{job_metrics}->inc("ingest_collate_items_total");

Expand All @@ -64,7 +79,8 @@ sub log_repeat {

if (-e $volume->get_zip_path() && -e $volume->get_mets_path()) {
$self->{is_repeat} = 1;
$self->set_info('Collating volume that is already in repo');
# deprecated format
$self->log_info('Collating volume that is already in repo');
}

}
Expand All @@ -73,7 +89,7 @@ sub collate {
my $self = shift;
my $storage = shift;

get_logger->trace("Starting collate for $storage");
$self->log_info("Starting collate for $storage->{name}");

$storage->validate_zip_completeness &&
$storage->encrypt &&
Expand Down Expand Up @@ -125,4 +141,53 @@ sub clean_success {
$self->{volume}->clean_sip_success();
}


# Returns the modification time of $file as a local-time string in
# "YYYY-MM-DD HH:MM:SS" form.
#
# Returns nothing (undef in scalar context) when the file cannot be
# stat()ed, e.g. because it does not exist.
sub file_date {
    my $self = shift;
    my $file = shift;

    # One stat() call both checks that the file is there and fetches its
    # mtime, so the result cannot go stale between an existence test and a
    # second lookup.
    my @file_stat = stat($file)
        or return;

    # Element 9 of stat() is the last-modify time in epoch seconds.
    return strftime( "%Y-%m-%d %H:%M:%S", localtime( $file_stat[9] ) );
}

# Records the size and date of this volume's zip and METS, as they now exist
# in the repository, in the feed_audit table.
#
# A row keyed on (namespace, id) is inserted; if one already exists, its
# zip_size, zip_date, mets_size and mets_date are overwritten. The dates are
# the files' modification times as reported by file_date, not the time of
# this call.
#
# first_ingest_date is not written here: it is set by default to
# CURRENT_TIMESTAMP on first insert.
#
# Dies if the zip or the METS is missing after collate. Returns the result of
# the statement handle's execute().
sub record_audit {
    my $self = shift;

    my $stmt =
    "insert into feed_audit (namespace, id, zip_size, zip_date, mets_size, mets_date) \
values(?,?,?,?,?,?) \
ON DUPLICATE KEY UPDATE zip_size=?, zip_date =?,mets_size=?,mets_date=?";

    my $volume = $self->{volume};

    my $repo_path = $volume->get_repository_path();
    my $zip_path  = $volume->get_repository_zip_path;
    die("Zip missing (in $repo_path) after collate") unless $zip_path and -e $zip_path;

    my $mets_path = $volume->get_repository_mets_path;
    die("METS missing (in $repo_path) after collate") unless $mets_path and -e $mets_path;

    my $zipsize  = -s $zip_path;
    my $zipdate  = $self->file_date($zip_path);
    my $metssize = -s $mets_path;
    my $metsdate = $self->file_date($mets_path);
    my $sth      = get_dbh()->prepare($stmt);
    $self->log_trace("feed_audit: $zip_path / $zipdate / $zipsize bytes");
    $self->log_trace("feed_audit: $mets_path / $metsdate / $metssize bytes");

    # Identify the volume through its accessors, as the stage logging code
    # does, rather than reaching into the volume's hash directly.
    my $res = $sth->execute(
        $volume->get_namespace(), $volume->get_objid(),
        $zipsize, $zipdate, $metssize, $metsdate,
        # duplicate parameters for duplicate key update
        $zipsize, $zipdate, $metssize, $metsdate
    );

    return $res;
}

1;
2 changes: 1 addition & 1 deletion lib/HTFeed/Storage.pm
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ sub zip_source {
# Returns the staging path for the encrypted copy of this volume's zip.
#
# The path is the volume's zip path under the configured staging zipfile
# location, suffixed with this storage's name and ".gpg". Including the
# storage name avoids collisions with encrypted zips left over from other
# storages; those should get cleaned up but in practice are not always.
sub encrypted_zip_staging {
    my $self = shift;

    my $staged_zip = $self->{volume}->get_zip_path( get_config( 'staging', 'zipfile' ) );

    return $staged_zip . "-$self->{name}.gpg";
}

sub encrypt {
Expand Down
Loading