From c64e5406d54bb8dc92154b57d07408a92385a353 Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Tue, 16 Jun 2026 10:05:44 +1000 Subject: [PATCH 1/2] Replace standardise_mom6_filenames.sh with a new script that: - Supports more than just annual timestamps (e.g. _2023_01_01) - Supports OM2 output as well --- payu_config/archive.sh | 2 +- .../standardise_mom6_filenames.sh | 53 ------ .../standardise_mom_filenames.py | 95 ++++++++++ test/test_payu_conf/test_mom6_filenames.py | 179 ------------------ test/test_payu_conf/test_mom_filenames.py | 149 +++++++++++++++ 5 files changed, 245 insertions(+), 233 deletions(-) delete mode 100755 payu_config/archive_scripts/standardise_mom6_filenames.sh create mode 100644 payu_config/archive_scripts/standardise_mom_filenames.py delete mode 100644 test/test_payu_conf/test_mom6_filenames.py create mode 100644 test/test_payu_conf/test_mom_filenames.py diff --git a/payu_config/archive.sh b/payu_config/archive.sh index 5d0719c6..52b3dcb0 100644 --- a/payu_config/archive.sh +++ b/payu_config/archive.sh @@ -2,4 +2,4 @@ # Copyright 2025 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. # SPDX-License-Identifier: Apache-2.0 -source $(dirname "$0")/archive_scripts/standardise_mom6_filenames.sh \ No newline at end of file +python3 $(dirname "$0")/archive_scripts/standardise_mom_filenames.py \ No newline at end of file diff --git a/payu_config/archive_scripts/standardise_mom6_filenames.sh b/payu_config/archive_scripts/standardise_mom6_filenames.sh deleted file mode 100755 index 2690f11c..00000000 --- a/payu_config/archive_scripts/standardise_mom6_filenames.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/bash -# Copyright 2024 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. -# SPDX-License-Identifier: Apache-2.0. 
-# -# Standardise file naming for MOM6 output files in access-om3 by removing the underscore before the four-digit year, i.e., replacing '_YYYY' with 'YYYY' -# This was written assuming it would be used as a payu "userscript" at the "archive" stage, but alternatively a path to an "archive" directory can be provided. -# For more details, see https://github.com/COSIMA/om3-scripts/issues/32 - -Help() -{ - # Display help - echo -e "Standardise file naming for MOM6 output files.\n" - echo "Syntax: scriptTemplate [-h|d DIRECTORY]" - echo "options:" - echo "h Print this help message." - echo -e "d Process files in the specified 'DIRECTORY'." -} - -while getopts ":hd:" option; do - case $option in - h) # display help - Help - exit;; - d) # Enter a directory - out_dir=$OPTARG - if [ ! -d $out_dir ]; then - echo $out_dir Does not exist - exit - fi;; - \?) # Invalid option - echo "Error: Invalid option" - exit;; - esac -done - -# if no directory was specified, collect all directories from 'archive' -if [ -z $out_dir ]; then - out_dirs=$(ls -rd archive/output*[0-9] 2>/dev/null) -else - out_dirs=$out_dir -fi - -# process each output directory -for dir in ${out_dirs[@]}; do - # process each mom6 file - for current_file in $dir/access-om3.mom6.*.nc*; do - if [ -f $current_file ]; then - new_filename=$(echo $current_file | sed -E 's/_([0-9]{4})/\1/') - # rename the file without overwriting existing files - mv -n $current_file $new_filename - fi - done -done diff --git a/payu_config/archive_scripts/standardise_mom_filenames.py b/payu_config/archive_scripts/standardise_mom_filenames.py new file mode 100644 index 00000000..f53cd764 --- /dev/null +++ b/payu_config/archive_scripts/standardise_mom_filenames.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# Copyright 2024 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. +# SPDX-License-Identifier: Apache-2.0. 
+# +# Standardise file naming for MOM output files in access-om by removing the underscore +# before date/time suffixes and replacing subsequent underscores with hyphens, e.g., +# replacing '_YYYY_MM' with 'YYYY-MM'. +# This was written assuming it would be used as a payu "userscript" at the "archive" stage, +# but alternatively a path to an "archive" directory can be provided. +# For more details, see https://github.com/COSIMA/om3-scripts/issues/32 + +import argparse +import glob +import os +import re +import sys + + +def standardised_filename(path): + """Return the standardised filename for a MOM output file, or None if no rename is needed. + + Files ending in *_DIGITS[_DIGITS...].nc[...] are renamed so that the first + underscore before the digit groups is removed and subsequent ones are replaced with '-'. + For example: + file._2023.nc -> file.2023.nc + file._2023_01.nc -> file.2023-01.nc + file._2023_01_15.nc -> file.2023-01-15.nc + """ + dirpath = os.path.dirname(path) + basename = os.path.basename(path) + + # Split the .nc extension + ext_match = re.match(r"^(.*?)(\.nc.*)$", basename) + if not ext_match: + return None + stem, ext = ext_match.groups() + + # Match the trailing block of underscore-separated digit groups + suffix_match = re.match(r"^(.*?)(_\d+(?:_\d+)*)$", stem) + if not suffix_match: + return None + base, suffix = suffix_match.groups() + + # Remove the leading underscore and replace any remaining ones with '-' + new_suffix = suffix[1:].replace("_", "-") + + new_basename = base + new_suffix + ext + if new_basename == basename: + return None + + return os.path.join(dirpath, new_basename) + + +def main(): + parser = argparse.ArgumentParser( + description="Standardise file naming for MOM output files." 
+ ) + parser.add_argument( + "-d", + metavar="DIRECTORY", + dest="out_dir", + help="Process files in the specified experiment 'DIRECTORY'.", + ) + args = parser.parse_args() + + if args.out_dir: + if not os.path.isdir(args.out_dir): + print(f"{args.out_dir} does not exist") + sys.exit(1) + out_dirs = [args.out_dir] + else: + out_dirs = sorted(glob.glob("archive/output*[0-9]"), reverse=True) + + # Support ACCESS-OM3 and ACCESS-OM2 output files + file_patterns = [ + "access-om3.mom6.*.nc*", + "ocean/access-om2.mom5.*.nc*", + ] + + for dir_path in out_dirs: + for pattern in file_patterns: + for current_file in glob.glob(f"{dir_path}/{pattern}"): + if not os.path.isfile(current_file): + continue + new_file = standardised_filename(current_file) + if new_file is None: + continue + if os.path.exists(new_file): + print(f"Skipping {current_file}: {new_file} already exists") + continue + os.rename(current_file, new_file) + + +if __name__ == "__main__": + main() diff --git a/test/test_payu_conf/test_mom6_filenames.py b/test/test_payu_conf/test_mom6_filenames.py deleted file mode 100644 index a71da939..00000000 --- a/test/test_payu_conf/test_mom6_filenames.py +++ /dev/null @@ -1,179 +0,0 @@ -import pytest -import pandas as pd - -from os import makedirs, chdir -from subprocess import run -from pathlib import Path - -scripts_base = Path(__file__).parents[2] -run_str = f"{scripts_base}/payu_config/archive_scripts/standardise_mom6_filenames.sh" - -DIAG_BASE = "access-om3.mom6.h.test" - - -def assert_file_exists(p): - if not Path(p).resolve().is_file(): - raise AssertionError("File does not exist: %s" % str(p)) - - -def assert_f_not_exists(p): - if Path(p).resolve().is_file(): - raise AssertionError("File exists and should not: %s" % str(p)) - - -def yearly_files(dir_name, n, tmp_path, splits=0): - """ - Make empty data files with `splits` option which will create split files - such as `access-om3.mom6.h.test._2010.nc.0001` ... `.nc.000N` for each year. 
- - if `splits` is 0, then it will create files like `access-om3.mom6.h.test._2010.nc`, - otherwise, it will create files like `access-om3.mom6.h.test._2010.nc.0001`, - `access-om3.mom6.h.test._2010.nc.0002`, etc. - """ - - times = pd.date_range("2010-01-01", freq="YE", periods=n) - - out_dir = str(tmp_path) + "/" + dir_name + "/" - paths = [] - - for t in times: - year = t.year - if splits: - for i in range(1, splits + 1): - paths.append(f"{out_dir}{DIAG_BASE}._{year}.nc.{str(i).zfill(4)}") - else: - paths.append(f"{out_dir}{DIAG_BASE}._{str(year)}.nc") - - makedirs(out_dir) - - for p in paths: - with open(p, "w") as f: - f.close() - - for p in paths: - assert_file_exists(p) - - return paths - - -# Add `splits` parameter -@pytest.mark.parametrize( - "hist_dir, use_dir, n, splits", - [ - ("archive/output000", False, 12, 0), - ("archive/output999", False, 1, 5), - ("archive/output9999", False, 1, 2), - ("archive/output574", True, 12, 3), - ], -) # run this test with a several folder names and lengths, provide the directory as an argument sometimes -def test_true_case(hist_dir, use_dir, n, tmp_path, splits): - - yearly_paths = yearly_files(hist_dir, n, tmp_path, splits) - chdir(tmp_path) - output_dir = Path(yearly_paths[0]).parents[0] - - if not use_dir: # default path - run([run_str]) - else: # provide path - run( - [ - run_str, - "-d", - output_dir, - ], - ) - - expected_years = pd.date_range("2010-01-01", freq="YE", periods=n + 1) - - expected_paths = [] - for t in expected_years: - year = t.year - if splits: - for i in range(1, splits + 1): - expected_paths.append( - f"{output_dir}/{DIAG_BASE}.{year}.nc.{str(i).zfill(4)}" - ) - else: - expected_paths.append(f"{output_dir}/{DIAG_BASE}.{str(year)}.nc") - - for p in expected_paths[0:n]: - assert_file_exists(p) - - for p in expected_paths[n]: - assert_f_not_exists(p) - - for p in yearly_paths: - assert_f_not_exists(p) - - -@pytest.mark.parametrize( - "hist_dir, use_dir, n", - [ - ("archive/output000", False, 12), - 
], -) -def test_dont_override(hist_dir, use_dir, n, tmp_path): - """ - make some empty data files, and make some files where the files should be renamed to, - and confirm it doesn't delete any of them - """ - - yearly_paths = yearly_files(hist_dir, n, tmp_path) - chdir(tmp_path) - output_dir = Path(yearly_paths[0]).parents[0] - - # write the expected output too - expected_years = pd.date_range("2010-01-01", freq="YE", periods=n) - - expected_paths = [ - f"{output_dir}/{DIAG_BASE}.{str(t)[0:4]}.nc" for t in expected_years - ] - - for p in expected_paths: - with open(p, "w") as f: - f.close() - - if not use_dir: # default path - run([run_str]) - else: # provide path - run( - [ - run_str, - "-d", - output_dir, - ], - ) - - for p in expected_paths: - assert_file_exists(p) - - for p in yearly_paths: - assert_file_exists(p) - - -# @pytest.mark.parametrize("hist_dir, ndays", [("Default", 31), ("Default", 27)]) -# def test_no_override(hist_dir, ndays, hist_base, tmp_path): -# """ -# Run the script to convert the daily data into monthly files, but the output filename already exists, and check nothing happens. -# """ - -# daily_paths = daily_files(hist_dir, hist_base, ndays, tmp_path) - -# chdir(tmp_path) -# output_dir = Path(daily_paths[0]).parents[0] - -# expected_months = pd.date_range("2010-01-01", freq="ME", periods=1) - -# monthly_paths = [ -# f"{output_dir}/{hist_base}.{str(t)[0:7]}.nc" for t in expected_months -# ] -# for p in monthly_paths: -# Path(p).touch() - -# run([run_str]) - -# for p in daily_paths: -# assert_file_exists(p) - -# for p in monthly_paths: -# assert_file_exists(p) diff --git a/test/test_payu_conf/test_mom_filenames.py b/test/test_payu_conf/test_mom_filenames.py new file mode 100644 index 00000000..fff8ee0f --- /dev/null +++ b/test/test_payu_conf/test_mom_filenames.py @@ -0,0 +1,149 @@ +# Copyright 2024 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. +# SPDX-License-Identifier: Apache-2.0. 
+ +import pytest +from os import chdir +from pathlib import Path +from subprocess import run + +scripts_base = Path(__file__).parents[2] +run_cmd = [ + "python3", + str(scripts_base / "payu_config/archive_scripts/standardise_mom_filenames.py"), +] + +MOM6_DIAG_BASE = "access-om3.mom6.test" +MOM5_DIAG_BASE = "access-om2.mom5.test" + + +def assert_file_exists(p): + if not Path(p).resolve().is_file(): + raise AssertionError(f"File does not exist: {p}") + + +def assert_file_not_exists(p): + if Path(p).resolve().is_file(): + raise AssertionError(f"File exists and should not: {p}") + + +def make_files(out_dir, diag_base, suffixes, splits=0): + """Create empty test files and return their paths. + + Creates '{diag_base}.{suffix}.nc' for each suffix. If splits > 0, creates split + files '{diag_base}.{suffix}.nc.0001' ... '.nc.{splits:04d}' instead. + """ + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + paths = [] + for suffix in suffixes: + if splits: + for i in range(1, splits + 1): + paths.append(out_dir / f"{diag_base}.{suffix}.nc.{i:04d}") + else: + paths.append(out_dir / f"{diag_base}.{suffix}.nc") + for p in paths: + p.touch() + return [str(p) for p in paths] + + +def standardised_path(orig_path, suffix): + """Return the expected path after standardisation of the given suffix. + + The first underscore in the suffix is removed and any remaining ones replaced + with '-', e.g. suffix '_2010_01' becomes '2010-01'. 
+ """ + p = Path(orig_path) + new_suffix = suffix[1:].replace("_", "-") + new_name = p.name.replace(f".{suffix}.", f".{new_suffix}.", 1) + return str(p.parent / new_name) + + +@pytest.mark.parametrize( + "hist_dir, diag_base, file_subdir, use_dir, suffixes, splits", + [ + # ACCESS-OM3 + ( + "archive/output000", + MOM6_DIAG_BASE, + "", + False, + ["_2010", "_2011_01", "_2012_01_01", "_2012_01_01_000000"], + 0, + ), + ( + "archive/output9999", + MOM6_DIAG_BASE, + "", + True, + ["_2010", "_2011_01", "_2012_01_01", "_2012_01_01_000000"], + 3, + ), + # ACCESS-OM2 + ( + "archive/output000", + MOM5_DIAG_BASE, + "ocean", + False, + ["_2010", "_2011_01", "_2012_01_01", "_2012_01_01_000000"], + 0, + ), + ( + "archive/output9999", + MOM5_DIAG_BASE, + "ocean", + True, + ["_2010", "_2011_01", "_2012_01_01", "_2012_01_01_000000"], + 3, + ), + ], +) +def test_rename(hist_dir, diag_base, file_subdir, use_dir, suffixes, splits, tmp_path): + output_dir = tmp_path / hist_dir + file_dir = output_dir / file_subdir if file_subdir else output_dir + + original_paths = make_files(str(file_dir), diag_base, suffixes, splits) + chdir(tmp_path) + + if use_dir: + run(run_cmd + ["-d", str(output_dir)]) + else: + run(run_cmd) + + for orig_path in original_paths: + name = Path(orig_path).name + suffix = next(s for s in suffixes if f".{s}." 
in name) + assert_file_exists(standardised_path(orig_path, suffix)) + assert_file_not_exists(orig_path) + + +@pytest.mark.parametrize( + "hist_dir, diag_base, file_subdir, suffixes", + [ + ("archive/output000", MOM6_DIAG_BASE, "", ["_2010", "_2011_01", "_2012_01_01"]), + ( + "archive/output000", + MOM5_DIAG_BASE, + "ocean", + ["_2010", "_2011_01", "_2012_01_01"], + ), + ], +) +def test_dont_override(hist_dir, diag_base, file_subdir, suffixes, tmp_path): + """Check that existing target files are not overwritten.""" + output_dir = tmp_path / hist_dir + file_dir = output_dir / file_subdir if file_subdir else output_dir + + original_paths = make_files(str(file_dir), diag_base, suffixes) + chdir(tmp_path) + + # Create files at the expected destination paths so there is something to protect + expected_paths = [standardised_path(p, s) for p, s in zip(original_paths, suffixes)] + for p in expected_paths: + Path(p).touch() + + run(run_cmd) + + for p in expected_paths: + assert_file_exists(p) + for p in original_paths: + assert_file_exists(p) From 4874a0d346f3fc8977e2046da8b63a7151fc078f Mon Sep 17 00:00:00 2001 From: Dougie Squire Date: Tue, 16 Jun 2026 16:23:34 +1000 Subject: [PATCH 2/2] Apply suggestion from code review --- payu_config/archive_scripts/standardise_mom_filenames.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/payu_config/archive_scripts/standardise_mom_filenames.py b/payu_config/archive_scripts/standardise_mom_filenames.py index f53cd764..18015a36 100644 --- a/payu_config/archive_scripts/standardise_mom_filenames.py +++ b/payu_config/archive_scripts/standardise_mom_filenames.py @@ -69,7 +69,7 @@ def main(): sys.exit(1) out_dirs = [args.out_dir] else: - out_dirs = sorted(glob.glob("archive/output*[0-9]"), reverse=True) + out_dirs = sorted(glob.glob("archive/output*"), reverse=True) # Support ACCESS-OM3 and ACCESS-OM2 output files file_patterns = [