diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d915a8f1..1130b4a9 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -8,6 +8,7 @@ env:
   CMAKE_VERSION: "~3.25.0"
   FORCE_COLOR: "YES"
   KART_S3_TEST_DATA_POINT_CLOUD: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz"
+  KART_S3_TEST_DATA_RASTER: "s3://kart-bring-your-own-data-poc/erorisk_si/*.tif"
   AWS_NO_SIGN_REQUEST: "1"
 
 jobs:
diff --git a/kart/byod/importer.py b/kart/byod/importer.py
index c4f0feef..854a29be 100644
--- a/kart/byod/importer.py
+++ b/kart/byod/importer.py
@@ -8,6 +8,7 @@
 from kart.lfs_util import dict_to_pointer_file_bytes
 from kart.progress_util import progress_bar
 from kart.s3_util import expand_s3_glob
+from kart.tile.tilename_util import PAM_SUFFIX
 
 L = logging.getLogger(__name__)
 
@@ -71,11 +72,14 @@ def import_tiles_to_stream(self, stream, sources):
                 continue
 
             # Tile hasn't been imported previously.
-            pointer_data = dict_to_pointer_file_bytes(
-                self.source_to_metadata[source]["tile"]
-            )
+            tile_info = self.source_to_metadata[source]["tile"]
+            pointer_data = dict_to_pointer_file_bytes(tile_info)
             write_blob_to_stream(stream, blob_path, pointer_data)
-
+            if "pamOid" in tile_info:
+                pam_data = dict_to_pointer_file_bytes(
+                    {"oid": tile_info["pamOid"], "size": tile_info["pamSize"]}
+                )
+                write_blob_to_stream(stream, blob_path + PAM_SUFFIX, pam_data)
             p.update(1)
 
         self.source_to_imported_metadata = self.source_to_metadata
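One behavioural note on the importer change above: when the tile metadata carries a "pamOid", a second pointer blob is now written for the PAM (.aux.xml) sidecar alongside the tile's own pointer. A minimal sketch of just that branch, with abbreviated values and a hypothetical blob path (the real code gets tile_info from extract_tile_metadata() and serialises with dict_to_pointer_file_bytes()):

    # Sketch only -- not the real call sites. Values abbreviated, blob_path hypothetical.
    PAM_SUFFIX = ".aux.xml"  # per kart.tile.tilename_util

    tile_info = {
        "name": "erorisk_silcdb4.tif",
        "oid": "sha256:c4bbea4d...",
        "size": 604652,
        "pamOid": "sha256:d8f514e6...",
        "pamSize": 36908,
    }
    blob_path = "erorisk_si/.raster-dataset.v1/tile/ab/erorisk_silcdb4"  # hypothetical

    if "pamOid" in tile_info:
        pam_pointer = {"oid": tile_info["pamOid"], "size": tile_info["pamSize"]}
        # the real code writes this via write_blob_to_stream(stream, blob_path + PAM_SUFFIX, ...)
        print(blob_path + PAM_SUFFIX)  # ...erorisk_silcdb4.aux.xml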
diff --git a/kart/byod/raster_import.py b/kart/byod/raster_import.py
new file mode 100644
index 00000000..db26f08e
--- /dev/null
+++ b/kart/byod/raster_import.py
@@ -0,0 +1,127 @@
+import logging
+
+import click
+
+from kart.byod.importer import ByodTileImporter
+from kart.cli_util import StringFromFile, MutexOption, KartCommand
+from kart.raster.import_ import RasterImporter
+from kart.raster.metadata_util import extract_raster_tile_metadata
+from kart.s3_util import get_hash_and_size_of_s3_object, fetch_from_s3
+
+
+L = logging.getLogger(__name__)
+
+
+@click.command("byod-raster-import", hidden=True, cls=KartCommand)
+@click.pass_context
+@click.option(
+    "--message",
+    "-m",
+    type=StringFromFile(encoding="utf-8"),
+    help="Commit message. By default this is auto-generated.",
+)
+@click.option(
+    "--checkout/--no-checkout",
+    "do_checkout",
+    is_flag=True,
+    default=True,
+    help="Whether to create a working copy once the import is finished, if no working copy exists yet.",
+)
+@click.option(
+    "--replace-existing",
+    is_flag=True,
+    cls=MutexOption,
+    exclusive_with=["--delete", "--update-existing"],
+    help="Replace existing dataset at the same path.",
+)
+@click.option(
+    "--update-existing",
+    is_flag=True,
+    cls=MutexOption,
+    exclusive_with=["--replace-existing"],
+    help=(
+        "Update existing dataset at the same path. "
+        "Tiles will be replaced by source tiles with the same name. "
+        "Tiles in the existing dataset which are not present in SOURCES will remain untouched."
+    ),
+)
+@click.option(
+    "--delete",
+    type=StringFromFile(encoding="utf-8"),
+    cls=MutexOption,
+    exclusive_with=["--replace-existing"],
+    multiple=True,
+    help=("Deletes the given tile. Can be used multiple times."),
+)
+@click.option(
+    "--amend",
+    default=False,
+    is_flag=True,
+    help="Amend the previous commit instead of adding a new commit.",
+)
+@click.option(
+    "--allow-empty",
+    is_flag=True,
+    default=False,
+    help=(
+        "Usually recording a commit that has the exact same tree as its sole "
+        "parent commit is a mistake, and the command prevents you from making "
+        "such a commit. This option bypasses the safety."
+    ),
+)
+@click.option(
+    "--num-workers",
+    "--num-processes",
+    type=click.INT,
+    help="How many import workers to run in parallel. Defaults to the number of available CPU cores.",
+    default=None,
+    hidden=True,
+)
+@click.option("--dataset-path", "--dataset", help="The dataset's path once imported")
+@click.argument(
+    "sources",
+    nargs=-1,
+    metavar="SOURCE [SOURCES...]",
+)
+def byod_raster_import(
+    ctx,
+    message,
+    do_checkout,
+    replace_existing,
+    update_existing,
+    delete,
+    amend,
+    allow_empty,
+    num_workers,
+    dataset_path,
+    sources,
+):
+    """
+    Experimental. Import a dataset of raster tiles from S3. Doesn't fetch the tiles, but stores the tiles' original locations.
+
+    SOURCES should be one or more GeoTIFF files (or wildcards that match multiple GeoTIFF files).
+    """
+    repo = ctx.obj.repo
+
+    ByodRasterImporter(repo, ctx).import_tiles(
+        convert_to_cloud_optimized=False,
+        dataset_path=dataset_path,
+        message=message,
+        do_checkout=do_checkout,
+        replace_existing=replace_existing,
+        update_existing=update_existing,
+        delete=delete,
+        amend=amend,
+        allow_empty=allow_empty,
+        sources=list(sources),
+        num_workers=num_workers,
+    )
+
+
+class ByodRasterImporter(ByodTileImporter, RasterImporter):
+    def extract_tile_metadata(self, tile_location):
+        oid_and_size = get_hash_and_size_of_s3_object(tile_location)
+        result = extract_raster_tile_metadata(tile_location, oid_and_size=oid_and_size)
+        # TODO - format still not definite, we might not put the whole URL in here.
+        result["tile"]["url"] = tile_location
+        return result
diff --git a/kart/cli.py b/kart/cli.py
index 4f2eda1f..5d3b1a13 100755
--- a/kart/cli.py
+++ b/kart/cli.py
@@ -55,6 +55,7 @@
     "install": {"install"},
     "add_dataset": {"add-dataset"},
     "byod.point_cloud_import": {"byod-point-cloud-import"},
+    "byod.raster_import": {"byod-raster-import"},
 }
 
 # These commands aren't valid Python symbols, even when we change dash to underscore.
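For anyone trying the new command, the invocation shape exercised by the test further down looks like this (bucket name is a placeholder; the command is registered hidden, so it won't show up in `kart --help`):

    kart byod-raster-import 's3://my-bucket/erorisk_si/*.tif' --dataset-path=erorisk_si

Quoting the glob matters: the wildcard is expanded by Kart itself (see expand_s3_glob in kart/byod/importer.py), not by the shell.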
diff --git a/kart/raster/metadata_util.py b/kart/raster/metadata_util.py
index 3cb4811a..f161cac6 100644
--- a/kart/raster/metadata_util.py
+++ b/kart/raster/metadata_util.py
@@ -5,11 +5,14 @@
 from xml.dom import minidom
 from xml.dom.minidom import Node
 
+from botocore.exceptions import ClientError
+
 from kart.crs_util import normalise_wkt
 from kart.geometry import ring_as_wkt
 from kart.list_of_conflicts import ListOfConflicts
 from kart.lfs_util import get_hash_and_size_of_file
 from kart.raster.validate_cloud_optimized_geotiff import validate as validate_cogtiff
+from kart.s3_util import fetch_from_s3, get_error_code
 from kart.schema import Schema, ColumnSchema
 from kart.tile.tilename_util import find_similar_files_case_insensitive, PAM_SUFFIX
 
@@ -110,7 +113,7 @@ def _rewrite_format(format_json, rewrite_metadata):
     return format_json
 
 
-def extract_raster_tile_metadata(raster_tile_path):
+def extract_raster_tile_metadata(raster_tile_path, oid_and_size=None):
     """
     Use gdalinfo to get any and all raster metadata we can make use of in Kart.
     This includes metadata that must be dataset-homogenous and would be stored in the dataset's /meta/ folder,
@@ -131,11 +134,13 @@ def extract_raster_tile_metadata(raster_tile_path):
     """
     from osgeo import gdal
 
-    metadata = gdal.Info(str(raster_tile_path), options=["-json", "-norat", "-noct"])
-
-    # NOTE: this format is still in early stages of design, is subject to change.
+    gdal_path_str = str(raster_tile_path)
+    if gdal_path_str.startswith("s3://"):
+        gdal_path_str = gdal_path_str.replace("s3://", "/vsis3/")
+    metadata = gdal.Info(gdal_path_str, options=["-json", "-norat", "-noct"])
 
-    warnings, errors, details = validate_cogtiff(str(raster_tile_path), full_check=True)
+    full_check = not gdal_path_str.startswith("/vsi")
+    warnings, errors, details = validate_cogtiff(gdal_path_str, full_check=full_check)
     is_cog = not errors
 
     format_json = {
@@ -149,7 +154,10 @@
     cc = metadata["cornerCoordinates"]
     size_in_pixels = metadata["size"]
 
-    oid, size = get_hash_and_size_of_file(raster_tile_path)
+    if oid_and_size:
+        oid, size = oid_and_size
+    else:
+        oid, size = get_hash_and_size_of_file(raster_tile_path)
 
     # Keep tile info keys in alphabetical order, except oid and size should be last.
     tile_info = {
@@ -171,34 +179,7 @@
         "tile": tile_info,
     }
 
-    try:
-        raster_tile_path = Path(raster_tile_path)
-        expected_pam_path = raster_tile_path.with_name(
-            raster_tile_path.name + PAM_SUFFIX
-        )
-        pams = find_similar_files_case_insensitive(expected_pam_path)
-        if len(pams) == 1:
-            pam_path = pams[0]
-            result.update(extract_aux_xml_metadata(pam_path))
-
-            if pam_path.name == expected_pam_path.name:
-                tile_info.update({"pamName": pam_path.name})
-            else:
-                tile_info.update(
-                    {"pamSourceName": pam_path.name, "pamName": expected_pam_path.name}
-                )
-
-            pam_oid, pam_size = get_hash_and_size_of_file(pam_path)
-            tile_info.update(
-                {
-                    "pamOid": f"sha256:{pam_oid}",
-                    "pamSize": pam_size,
-                }
-            )
-    except Exception as e:
-        # TODO - how to handle corrupted PAM file.
-        L.warning("Error extracting aux-xml metadata %s", e)
-
+    _find_and_add_pam_info(raster_tile_path, result)
     return result
@@ -253,6 +234,56 @@
     return ColumnSchema(result)
 
 
+def _find_and_add_pam_info(raster_tile_path, raster_tile_metadata):
+    tile_info = raster_tile_metadata["tile"]
+
+    if str(raster_tile_path).startswith("s3://"):
+        try:
+            pam_path = fetch_from_s3(str(raster_tile_path) + PAM_SUFFIX)
+            raster_tile_metadata.update(extract_aux_xml_metadata(pam_path))
+            pam_oid, pam_size = get_hash_and_size_of_file(pam_path)
+            tile_info.update(
+                {
+                    "pamOid": f"sha256:{pam_oid}",
+                    "pamSize": pam_size,
+                }
+            )
+            pam_path.unlink()
+            return
+        except ClientError as e:
+            if get_error_code(e) != 404:
+                L.warning("Error extracting aux-xml metadata: %s", e)
+            return
+
+    try:
+        raster_tile_path = Path(raster_tile_path)
+        expected_pam_path = raster_tile_path.with_name(
+            raster_tile_path.name + PAM_SUFFIX
+        )
+        pams = find_similar_files_case_insensitive(expected_pam_path)
+        if len(pams) == 1:
+            pam_path = pams[0]
+            raster_tile_metadata.update(extract_aux_xml_metadata(pam_path))
+
+            if pam_path.name == expected_pam_path.name:
+                tile_info.update({"pamName": pam_path.name})
+            else:
+                tile_info.update(
+                    {"pamSourceName": pam_path.name, "pamName": expected_pam_path.name}
+                )
+
+            pam_oid, pam_size = get_hash_and_size_of_file(pam_path)
+            tile_info.update(
+                {
+                    "pamOid": f"sha256:{pam_oid}",
+                    "pamSize": pam_size,
+                }
+            )
+    except Exception as e:
+        # TODO - how to handle corrupted PAM file.
+        L.warning("Error extracting aux-xml metadata: %s", e)
+
+
 def extract_aux_xml_metadata(aux_xml_path):
     """
     Given the path to a tif.aux.xml file, tries to extract the following:
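Two remote-path details in the metadata changes above are easy to miss: GDAL can't open s3:// URLs directly, so they're rewritten onto its /vsis3/ virtual filesystem before being passed to gdal.Info(), and full_check is disabled for /vsi* paths, presumably so the COG validation doesn't have to read the entire remote file. A sketch of the rewrite, outside the real function:

    # Sketch of the rewrite above: GDAL's /vsis3/ virtual filesystem handler
    # reads S3 objects, so "s3://bucket/key" must become "/vsis3/bucket/key".
    def to_gdal_path(path):
        if path.startswith("s3://"):
            return path.replace("s3://", "/vsis3/")
        return path

    assert (
        to_gdal_path("s3://kart-bring-your-own-data-poc/erorisk_si/erorisk_silcdb4.tif")
        == "/vsis3/kart-bring-your-own-data-poc/erorisk_si/erorisk_silcdb4.tif"
    )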
- L.warning("Error extracting aux-xml metadata %s", e) - + _find_and_add_pam_info(raster_tile_path, result) return result @@ -253,6 +234,56 @@ def gdalinfo_band_to_kart_columnschema(gdalinfo_band): return ColumnSchema(result) +def _find_and_add_pam_info(raster_tile_path, raster_tile_metadata): + tile_info = raster_tile_metadata["tile"] + + if str(raster_tile_path).startswith("s3://"): + try: + pam_path = fetch_from_s3(str(raster_tile_path) + PAM_SUFFIX) + raster_tile_metadata.update(extract_aux_xml_metadata(pam_path)) + pam_oid, pam_size = get_hash_and_size_of_file(pam_path) + tile_info.update( + { + "pamOid": f"sha256:{pam_oid}", + "pamSize": pam_size, + } + ) + pam_path.unlink() + return + except ClientError as e: + if get_error_code(e) != 404: + L.warning("Error extracting aux-xml metadata: %s", e) + return + + try: + raster_tile_path = Path(raster_tile_path) + expected_pam_path = raster_tile_path.with_name( + raster_tile_path.name + PAM_SUFFIX + ) + pams = find_similar_files_case_insensitive(expected_pam_path) + if len(pams) == 1: + pam_path = pams[0] + raster_tile_metadata.update(extract_aux_xml_metadata(pam_path)) + + if pam_path.name == expected_pam_path.name: + tile_info.update({"pamName": pam_path.name}) + else: + tile_info.update( + {"pamSourceName": pam_path.name, "pamName": expected_pam_path.name} + ) + + pam_oid, pam_size = get_hash_and_size_of_file(pam_path) + tile_info.update( + { + "pamOid": f"sha256:{pam_oid}", + "pamSize": pam_size, + } + ) + except Exception as e: + # TODO - how to handle corrupted PAM file. + L.warning("Error extracting aux-xml metadata: %s", e) + + def extract_aux_xml_metadata(aux_xml_path): """ Given the path to a tif.aux.xml file, tries to extract the following: diff --git a/kart/s3_util.py b/kart/s3_util.py index f9d94279..2f664931 100644 --- a/kart/s3_util.py +++ b/kart/s3_util.py @@ -104,3 +104,12 @@ def get_hash_and_size_of_s3_object(s3_url): sha256 = standard_b64decode(response["ChecksumSHA256"]).hex() size = response["ContentLength"] return sha256, size + + +def get_error_code(client_error): + response = getattr(client_error, "response") + error = response.get("Error") if response else None + code = error.get("Code") if error else None + if code and code.isdigit(): + code = int(code) + return code diff --git a/tests/byod/test_imports.py b/tests/byod/test_imports.py index c241b9cf..85688712 100644 --- a/tests/byod/test_imports.py +++ b/tests/byod/test_imports.py @@ -11,7 +11,6 @@ def test_byod_point_cloud_import( cli_runner, s3_test_data_point_cloud, ): - # Using postgres here because it has the best type preservation repo_path = tmp_path / "point-cloud-repo" r = cli_runner.invoke(["init", repo_path]) assert r.exit_code == 0 @@ -67,3 +66,68 @@ def test_byod_point_cloud_import( "oid": "sha256:6b980ce4d7f4978afd3b01e39670e2071a792fba441aca45be69be81cb48b08c", "size": 51489, } + + +@pytest.mark.slow +def test_byod_raster_import( + tmp_path, + chdir, + cli_runner, + s3_test_data_raster, +): + repo_path = tmp_path / "point-cloud-repo" + r = cli_runner.invoke(["init", repo_path]) + assert r.exit_code == 0 + + with chdir(repo_path): + r = cli_runner.invoke( + [ + "byod-raster-import", + s3_test_data_raster, + "--dataset-path=erorisk_si", + ] + ) + assert r.exit_code == 0, r.stderr + + r = cli_runner.invoke(["data", "ls"]) + assert r.exit_code == 0, r.stderr + assert r.stdout.splitlines() == ["erorisk_si"] + + r = cli_runner.invoke(["show", "-o", "json"]) + assert r.exit_code == 0, r.stderr + output = json.loads(r.stdout) + erorisk_si = 
output["kart.diff/v1+hexwkb"]["erorisk_si"] + assert erorisk_si["meta"]["schema.json"]["+"] == [ + { + "dataType": "integer", + "size": 8, + "description": "erorisk_si", + "interpretation": "palette", + "unsigned": True, + } + ] + assert erorisk_si["meta"]["band/1/categories.json"]["+"] == { + "1": "High landslide risk - delivery to stream", + "2": "High landslide risk - non-delivery to steam", + "3": "Moderate earthflow risk", + "4": "Severe earthflow risk", + "5": "Gully risk", + } + + tile_url = os.path.join( + s3_test_data_raster.split("*")[0], "erorisk_silcdb4.tif" + ) + + assert erorisk_si["tile"][0]["+"] == { + "name": "erorisk_silcdb4.tif", + "crs84Extent": "POLYGON((172.6754107 -43.7555641,172.6748326 -43.8622096,172.8170036 -43.8625257,172.8173289 -43.755879,172.6754107 -43.7555641,172.6754107 -43.7555641))", + "dimensions": "762x790", + "format": "geotiff/cog", + "nativeExtent": "POLYGON((1573869.73 5155224.347,1573869.73 5143379.674,1585294.591 5143379.674,1585294.591 5155224.347,1573869.73 5155224.347))", + "url": tile_url, + "oid": "sha256:c4bbea4d7cfd54f4cdbca887a1b358a81710e820a6aed97cdf3337fd3e14f5aa", + "size": 604652, + "pamName": "erorisk_silcdb4.tif.aux.xml", + "pamOid": "sha256:d8f514e654a81bdcd7428886a15e300c56b5a5ff92898315d16757562d2968ca", + "pamSize": 36908, + } diff --git a/tests/conftest.py b/tests/conftest.py index 9a534249..69666803 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1209,6 +1209,21 @@ def s3_test_data_point_cloud(monkeypatch_session): return os.environ["KART_S3_TEST_DATA_POINT_CLOUD"] +@pytest.fixture() +def s3_test_data_raster(monkeypatch_session): + """ + You can run tests that fetch a copy of the erosion test data from S3 (and so test Kart's S3 behaviour) + by setting KART_S3_TEST_DATA_RASTER=s3://some-bucket/path-to-erosion-tiles/*.tif + The data hosted there should be the data found at tests/data/raster/cog-erosion.tgz + """ + if "KART_S3_TEST_DATA_RASTER" not in os.environ: + raise pytest.skip( + "S3 tests require configuration - read docstring at conftest.s3_test_data_raster" + ) + _restore_aws_config_during_testing() + return os.environ["KART_S3_TEST_DATA_RASTER"] + + @pytest.fixture() def dodgy_restore(cli_runner): """ diff --git a/tests/point_cloud/test_imports.py b/tests/point_cloud/test_imports.py index 8ab428c1..f213e363 100644 --- a/tests/point_cloud/test_imports.py +++ b/tests/point_cloud/test_imports.py @@ -147,7 +147,6 @@ def test_import_several_laz__convert( requires_git_lfs, check_tile_is_reflinked, ): - # Using postgres here because it has the best type preservation with data_archive_readonly("point-cloud/laz-auckland.tgz") as auckland: repo_path = tmp_path / "point-cloud-repo" r = cli_runner.invoke(["init", repo_path])