Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds an experimental byod-raster-import command... #909

Merged
merged 2 commits into from
Oct 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ env:
CMAKE_VERSION: "~3.25.0"
FORCE_COLOR: "YES"
KART_S3_TEST_DATA_POINT_CLOUD: "s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/*.laz"
KART_S3_TEST_DATA_RASTER: "s3://kart-bring-your-own-data-poc/erorisk_si/*.tif"
AWS_NO_SIGN_REQUEST: "1"

jobs:
Expand Down
12 changes: 8 additions & 4 deletions kart/byod/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from kart.lfs_util import dict_to_pointer_file_bytes
from kart.progress_util import progress_bar
from kart.s3_util import expand_s3_glob
from kart.tile.tilename_util import PAM_SUFFIX

L = logging.getLogger(__name__)

Expand Down Expand Up @@ -71,11 +72,14 @@ def import_tiles_to_stream(self, stream, sources):
continue

# Tile hasn't been imported previously.
pointer_data = dict_to_pointer_file_bytes(
self.source_to_metadata[source]["tile"]
)
tile_info = self.source_to_metadata[source]["tile"]
pointer_data = dict_to_pointer_file_bytes(tile_info)
write_blob_to_stream(stream, blob_path, pointer_data)

if "pamOid" in tile_info:
pam_data = dict_to_pointer_file_bytes(
{"oid": tile_info["pamOid"], "size": tile_info["pamSize"]}
)
write_blob_to_stream(stream, blob_path + PAM_SUFFIX, pam_data)
p.update(1)

self.source_to_imported_metadata = self.source_to_metadata
Expand Down
127 changes: 127 additions & 0 deletions kart/byod/raster_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import logging

import click

from kart.byod.importer import ByodTileImporter
from kart.cli_util import StringFromFile, MutexOption, KartCommand
from kart.raster.import_ import RasterImporter
from kart.raster.metadata_util import extract_raster_tile_metadata
from kart.s3_util import get_hash_and_size_of_s3_object, fetch_from_s3


L = logging.getLogger(__name__)


@click.command("byod-raster-import", hidden=True, cls=KartCommand)
@click.pass_context
@click.option(
"--message",
"-m",
type=StringFromFile(encoding="utf-8"),
help="Commit message. By default this is auto-generated.",
)
@click.option(
"--checkout/--no-checkout",
"do_checkout",
is_flag=True,
default=True,
help="Whether to create a working copy once the import is finished, if no working copy exists yet.",
)
@click.option(
"--replace-existing",
is_flag=True,
cls=MutexOption,
exclusive_with=["--delete", "--update-existing"],
help="Replace existing dataset at the same path.",
)
@click.option(
"--update-existing",
is_flag=True,
cls=MutexOption,
exclusive_with=["--replace-existing"],
help=(
"Update existing dataset at the same path. "
"Tiles will be replaced by source tiles with the same name. "
"Tiles in the existing dataset which are not present in SOURCES will remain untouched."
),
)
@click.option(
"--delete",
type=StringFromFile(encoding="utf-8"),
cls=MutexOption,
exclusive_with=["--replace-existing"],
multiple=True,
help=("Deletes the given tile. Can be used multiple times."),
)
@click.option(
"--amend",
default=False,
is_flag=True,
help="Amend the previous commit instead of adding a new commit",
)
@click.option(
"--allow-empty",
is_flag=True,
default=False,
help=(
"Usually recording a commit that has the exact same tree as its sole "
"parent commit is a mistake, and the command prevents you from making "
"such a commit. This option bypasses the safety"
),
)
@click.option(
"--num-workers",
"--num-processes",
type=click.INT,
help="How many import workers to run in parallel. Defaults to the number of available CPU cores.",
default=None,
hidden=True,
)
@click.option("--dataset-path", "--dataset", help="The dataset's path once imported")
@click.argument(
"sources",
nargs=-1,
metavar="SOURCE [SOURCES...]",
)
def byod_raster_import(
ctx,
message,
do_checkout,
replace_existing,
update_existing,
delete,
amend,
allow_empty,
num_workers,
dataset_path,
sources,
):
"""
Experimental. Import a dataset of raster tiles from S3. Doesn't fetch the tiles, does store the tiles original location.

SOURCES should be one or more GeoTIFF files (or wildcards that match multiple GeoTIFF files).
"""
repo = ctx.obj.repo

ByodRasterImporter(repo, ctx).import_tiles(
convert_to_cloud_optimized=False,
dataset_path=dataset_path,
message=message,
do_checkout=do_checkout,
replace_existing=replace_existing,
update_existing=update_existing,
delete=delete,
amend=amend,
allow_empty=allow_empty,
sources=list(sources),
num_workers=num_workers,
)


class ByodRasterImporter(ByodTileImporter, RasterImporter):
def extract_tile_metadata(self, tile_location):
oid_and_size = get_hash_and_size_of_s3_object(tile_location)
result = extract_raster_tile_metadata(tile_location, oid_and_size=oid_and_size)
# TODO - format still not definite, we might not put the whole URL in here.
result["tile"]["url"] = tile_location
return result
1 change: 1 addition & 0 deletions kart/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"install": {"install"},
"add_dataset": {"add-dataset"},
"byod.point_cloud_import": {"byod-point-cloud-import"},
"byod.raster_import": {"byod-raster-import"},
}

# These commands aren't valid Python symbols, even when we change dash to underscore.
Expand Down
99 changes: 65 additions & 34 deletions kart/raster/metadata_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
from xml.dom import minidom
from xml.dom.minidom import Node

from botocore.exceptions import ClientError

from kart.crs_util import normalise_wkt
from kart.geometry import ring_as_wkt
from kart.list_of_conflicts import ListOfConflicts
from kart.lfs_util import get_hash_and_size_of_file
from kart.raster.validate_cloud_optimized_geotiff import validate as validate_cogtiff
from kart.s3_util import fetch_from_s3, get_error_code
from kart.schema import Schema, ColumnSchema
from kart.tile.tilename_util import find_similar_files_case_insensitive, PAM_SUFFIX

Expand Down Expand Up @@ -110,7 +113,7 @@ def _rewrite_format(format_json, rewrite_metadata):
return format_json


def extract_raster_tile_metadata(raster_tile_path):
def extract_raster_tile_metadata(raster_tile_path, oid_and_size=None):
"""
Use gdalinfo to get any and all raster metadata we can make use of in Kart.
This includes metadata that must be dataset-homogenous and would be stored in the dataset's /meta/ folder,
Expand All @@ -131,11 +134,13 @@ def extract_raster_tile_metadata(raster_tile_path):
"""
from osgeo import gdal

metadata = gdal.Info(str(raster_tile_path), options=["-json", "-norat", "-noct"])

# NOTE: this format is still in early stages of design, is subject to change.
gdal_path_str = str(raster_tile_path)
if gdal_path_str.startswith("s3://"):
gdal_path_str = gdal_path_str.replace("s3://", "/vsis3/")
metadata = gdal.Info(gdal_path_str, options=["-json", "-norat", "-noct"])

warnings, errors, details = validate_cogtiff(str(raster_tile_path), full_check=True)
full_check = not gdal_path_str.startswith("/vsi")
warnings, errors, details = validate_cogtiff(gdal_path_str, full_check=full_check)
is_cog = not errors

format_json = {
Expand All @@ -149,7 +154,10 @@ def extract_raster_tile_metadata(raster_tile_path):

cc = metadata["cornerCoordinates"]
size_in_pixels = metadata["size"]
oid, size = get_hash_and_size_of_file(raster_tile_path)
if oid_and_size:
oid, size = oid_and_size
else:
oid, size = get_hash_and_size_of_file(raster_tile_path)

# Keep tile info keys in alphabetical order, except oid and size should be last.
tile_info = {
Expand All @@ -171,34 +179,7 @@ def extract_raster_tile_metadata(raster_tile_path):
"tile": tile_info,
}

try:
raster_tile_path = Path(raster_tile_path)
expected_pam_path = raster_tile_path.with_name(
raster_tile_path.name + PAM_SUFFIX
)
pams = find_similar_files_case_insensitive(expected_pam_path)
if len(pams) == 1:
pam_path = pams[0]
result.update(extract_aux_xml_metadata(pam_path))

if pam_path.name == expected_pam_path.name:
tile_info.update({"pamName": pam_path.name})
else:
tile_info.update(
{"pamSourceName": pam_path.name, "pamName": expected_pam_path.name}
)

pam_oid, pam_size = get_hash_and_size_of_file(pam_path)
tile_info.update(
{
"pamOid": f"sha256:{pam_oid}",
"pamSize": pam_size,
}
)
except Exception as e:
# TODO - how to handle corrupted PAM file.
L.warning("Error extracting aux-xml metadata %s", e)

_find_and_add_pam_info(raster_tile_path, result)
return result


Expand Down Expand Up @@ -253,6 +234,56 @@ def gdalinfo_band_to_kart_columnschema(gdalinfo_band):
return ColumnSchema(result)


def _find_and_add_pam_info(raster_tile_path, raster_tile_metadata):
tile_info = raster_tile_metadata["tile"]

if str(raster_tile_path).startswith("s3://"):
try:
pam_path = fetch_from_s3(str(raster_tile_path) + PAM_SUFFIX)
raster_tile_metadata.update(extract_aux_xml_metadata(pam_path))
pam_oid, pam_size = get_hash_and_size_of_file(pam_path)
tile_info.update(
{
"pamOid": f"sha256:{pam_oid}",
"pamSize": pam_size,
}
)
pam_path.unlink()
return
except ClientError as e:
if get_error_code(e) != 404:
L.warning("Error extracting aux-xml metadata: %s", e)
return

try:
raster_tile_path = Path(raster_tile_path)
expected_pam_path = raster_tile_path.with_name(
raster_tile_path.name + PAM_SUFFIX
)
pams = find_similar_files_case_insensitive(expected_pam_path)
if len(pams) == 1:
pam_path = pams[0]
raster_tile_metadata.update(extract_aux_xml_metadata(pam_path))

if pam_path.name == expected_pam_path.name:
tile_info.update({"pamName": pam_path.name})
else:
tile_info.update(
{"pamSourceName": pam_path.name, "pamName": expected_pam_path.name}
)

pam_oid, pam_size = get_hash_and_size_of_file(pam_path)
tile_info.update(
{
"pamOid": f"sha256:{pam_oid}",
"pamSize": pam_size,
}
)
except Exception as e:
# TODO - how to handle corrupted PAM file.
L.warning("Error extracting aux-xml metadata: %s", e)


def extract_aux_xml_metadata(aux_xml_path):
"""
Given the path to a tif.aux.xml file, tries to extract the following:
Expand Down
9 changes: 9 additions & 0 deletions kart/s3_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,12 @@ def get_hash_and_size_of_s3_object(s3_url):
sha256 = standard_b64decode(response["ChecksumSHA256"]).hex()
size = response["ContentLength"]
return sha256, size


def get_error_code(client_error):
response = getattr(client_error, "response")
error = response.get("Error") if response else None
code = error.get("Code") if error else None
if code and code.isdigit():
code = int(code)
return code
Loading
Loading