Skip to content

Commit

Permalink
AWS: Switch to base2 entropy in ObjectStoreLocationProvider for optim…
Browse files Browse the repository at this point in the history
…ized S3 performance

Co-authored-by: Drew Schleit <[email protected]>
  • Loading branch information
ookumuso and drewschleit committed Oct 1, 2024
1 parent 34cd01b commit 0217f82
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 13 deletions.
28 changes: 20 additions & 8 deletions core/src/main/java/org/apache/iceberg/LocationProviders.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
import org.apache.iceberg.relocated.com.google.common.hash.HashCode;
import org.apache.iceberg.relocated.com.google.common.hash.HashFunction;
import org.apache.iceberg.relocated.com.google.common.hash.Hashing;
import org.apache.iceberg.relocated.com.google.common.io.BaseEncoding;
import org.apache.iceberg.util.LocationUtil;
import org.apache.iceberg.util.PropertyUtil;

Expand Down Expand Up @@ -108,10 +107,11 @@ public String newDataLocation(String filename) {
static class ObjectStoreLocationProvider implements LocationProvider {

private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed();
private static final BaseEncoding BASE64_ENCODER = BaseEncoding.base64Url().omitPadding();
private static final ThreadLocal<byte[]> TEMP = ThreadLocal.withInitial(() -> new byte[4]);
// the starting index of the lower 20 bits of a 32-bit binary string
private static final int HASH_BINARY_STRING_START_INDEX = 12;
private final String storageLocation;
private final String context;
private final boolean excludePartitionValues;

ObjectStoreLocationProvider(String tableLocation, Map<String, String> properties) {
this.storageLocation =
Expand All @@ -123,6 +123,11 @@ static class ObjectStoreLocationProvider implements LocationProvider {
} else {
this.context = pathContext(tableLocation);
}
this.excludePartitionValues =
PropertyUtil.propertyAsBoolean(
properties,
TableProperties.WRITE_OBJECT_STORE_OMIT_PARTITION_VALUES,
TableProperties.WRITE_OBJECT_STORE_OMIT_PARTITION_VALUES_DEFAULT);
}

private static String dataLocation(Map<String, String> properties, String tableLocation) {
Expand All @@ -141,7 +146,12 @@ private static String dataLocation(Map<String, String> properties, String tableL

@Override
public String newDataLocation(PartitionSpec spec, StructLike partitionData, String filename) {
return newDataLocation(String.format("%s/%s", spec.partitionToPath(partitionData), filename));
if (excludePartitionValues) {
return newDataLocation(filename);
} else {
return newDataLocation(
String.format("%s/%s", spec.partitionToPath(partitionData), filename));
}
}

@Override
Expand Down Expand Up @@ -172,10 +182,12 @@ private static String pathContext(String tableLocation) {
}

private String computeHash(String fileName) {
byte[] bytes = TEMP.get();
HashCode hash = HASH_FUNC.hashString(fileName, StandardCharsets.UTF_8);
hash.writeBytesTo(bytes, 0, 4);
return BASE64_ENCODER.encode(bytes);
HashCode hashCode = HASH_FUNC.hashString(fileName, StandardCharsets.UTF_8);

// {@link Integer#toBinaryString} excludes leading zeros, which we want to preserve.
// force the first bit to be set to get around that.
String hashAsBinaryString = Integer.toBinaryString(hashCode.asInt() | Integer.MIN_VALUE);
return hashAsBinaryString.substring(HASH_BINARY_STRING_START_INDEX);
}
}
}
5 changes: 5 additions & 0 deletions core/src/main/java/org/apache/iceberg/TableProperties.java
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,11 @@ private TableProperties() {}
public static final String OBJECT_STORE_ENABLED = "write.object-storage.enabled";
public static final boolean OBJECT_STORE_ENABLED_DEFAULT = false;

// Excludes partition values from the file path when set to true and object storage is enabled
public static final String WRITE_OBJECT_STORE_OMIT_PARTITION_VALUES =
"write.object-storage.omit-partition-values";
public static final boolean WRITE_OBJECT_STORE_OMIT_PARTITION_VALUES_DEFAULT = false;

/**
* @deprecated Use {@link #WRITE_DATA_LOCATION} instead.
*/
Expand Down
32 changes: 31 additions & 1 deletion core/src/test/java/org/apache/iceberg/TestLocationProvider.java
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ public void testObjectStorageLocationProviderPathResolution() {

String dataPath = "s3://random/data/location";
table.updateProperties().set(TableProperties.WRITE_DATA_LOCATION, dataPath).commit();

assertThat(table.locationProvider().newDataLocation("file"))
.as("write data path should be used when set")
.contains(dataPath);
Expand Down Expand Up @@ -304,4 +303,35 @@ public void testEncodedFieldNameInPartitionPath() {

assertThat(partitionString).isEqualTo("data%231=val%231");
}

@TestTemplate
public void testExcludePartitionInPath() {
// Enable the object store location provider and configure it to omit partition values
table.updateProperties().set(TableProperties.OBJECT_STORE_ENABLED, "true").commit();
table
.updateProperties()
.set(TableProperties.WRITE_OBJECT_STORE_OMIT_PARTITION_VALUES, "true")
.commit();

// Create partition data; its value should not appear in the resulting file path
StructLike partitionData = TestHelpers.CustomRow.of(0, "val");
String fileLocation =
table.locationProvider().newDataLocation(table.spec(), partitionData, "test.parquet");

// no partition values included in the path
assertThat(fileLocation).endsWith("/data/01101010001111101000/test.parquet");
}

@TestTemplate
public void testHashInjection() {
table.updateProperties().set(TableProperties.OBJECT_STORE_ENABLED, "true").commit();
assertThat(table.locationProvider().newDataLocation("a"))
.endsWith("/data/01010110100110110010/a");
assertThat(table.locationProvider().newDataLocation("b"))
.endsWith("/data/11100111111000000011/b");
assertThat(table.locationProvider().newDataLocation("c"))
.endsWith("/data/00101101011001011111/c");
assertThat(table.locationProvider().newDataLocation("d"))
.endsWith("/data/10010001010001110011/d");
}
}
19 changes: 15 additions & 4 deletions docs/docs/aws.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,10 @@ Data stored in S3 with a traditional Hive storage layout can face S3 request thr

Iceberg by default uses the Hive storage layout but can be switched to use the `ObjectStoreLocationProvider`.
With `ObjectStoreLocationProvider`, a deterministic hash is generated for each stored file, with the hash appended
directly after the `write.data.path`. This ensures files written to s3 are equally distributed across multiple [prefixes](https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/) in the S3 bucket. Resulting in minimized throttling and maximized throughput for S3-related IO operations. When using `ObjectStoreLocationProvider` having a shared and short `write.data.path` across your Iceberg tables will improve performance.
directly after the `write.data.path`. This ensures files written to S3 are equally distributed across multiple
[prefixes](https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/) in the S3 bucket;
resulting in minimized throttling and maximized throughput for S3-related IO operations. When using `ObjectStoreLocationProvider`
having a shared `write.data.path` across your Iceberg tables will improve performance.

For more information on how S3 scales API QPS, check out the 2018 re:Invent session on [Best Practices for Amazon S3 and Amazon S3 Glacier](https://youtu.be/rHeTn9pHNKo?t=3219). At [53:39](https://youtu.be/rHeTn9pHNKo?t=3219) it covers how S3 scales/partitions & at [54:50](https://youtu.be/rHeTn9pHNKo?t=3290) it discusses the 30-60 minute wait time before new partitions are created.

Expand All @@ -357,7 +360,7 @@ CREATE TABLE my_catalog.my_ns.my_table (
USING iceberg
OPTIONS (
'write.object-storage.enabled'=true,
'write.data.path'='s3://my-table-data-bucket')
'write.data.path'='s3://my-table-data-bucket/my_table')
PARTITIONED BY (category);
```

Expand All @@ -366,9 +369,11 @@ We can then insert a single row into this new table
INSERT INTO my_catalog.my_ns.my_table VALUES (1, "Pizza", "orders");
```

Which will write the data to S3 with a hash (`2d3905f8`) appended directly after the `write.object-storage.path`, ensuring reads to the table are spread evenly across [S3 bucket prefixes](https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html), and improving performance.
Which will write the data to S3 with a hash (`01010110100110110010`) appended directly after the `write.object-storage.path`,
ensuring reads to the table are spread evenly across [S3 bucket prefixes](https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html), and improving performance.
The hash, previously base64-encoded, is now base2-encoded in order to provide improved auto-scaling behavior on S3 general purpose buckets.
```
s3://my-table-data-bucket/2d3905f8/my_ns.db/my_table/category=orders/00000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet
s3://my-table-data-bucket/my_ns.db/my_table/01010110100110110010/category=orders/00000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet
```

Note, the path resolution logic for `ObjectStoreLocationProvider` is `write.data.path` then `<tableLocation>/data`.
Expand All @@ -378,6 +383,12 @@ However, for the older versions up to 0.12.0, the logic is as follows:

For more details, please refer to the [LocationProvider Configuration](custom-catalog.md#custom-location-provider-implementation) section.

A new table property, `write.object-storage.omit-partition-values` (default: `false`), can be set to `true` to omit
partition values from the file path. Iceberg does not need these values in the file path, and omitting them can further
reduce the key count. With this property enabled, the inserted key would look like the following — note that `category=orders` is removed:
```
s3://my-table-data-bucket/my_ns.db/my_table/11010100101100111010/00000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet
```
### S3 Strong Consistency

In November 2020, S3 announced [strong consistency](https://aws.amazon.com/s3/consistency/) for all read operations, and Iceberg is updated to fully leverage this feature.
Expand Down
1 change: 1 addition & 0 deletions docs/docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Iceberg tables support table properties to configure table behavior, like the de
| write.metadata.previous-versions-max | 100 | The max number of previous version metadata files to keep before deleting after commit |
| write.spark.fanout.enabled | false | Enables the fanout writer in Spark that does not require data to be clustered; uses more memory |
| write.object-storage.enabled | false | Enables the object storage location provider that adds a hash component to file paths |
| write.object-storage.omit-partition-values | false | Excludes the partition values from the file path |
| write.data.path | table location + /data | Base location for data files |
| write.metadata.path | table location + /metadata | Base location for metadata files |
| write.delete.mode | copy-on-write | Mode used for delete commands: copy-on-write or merge-on-read (v2 only) |
Expand Down

0 comments on commit 0217f82

Please sign in to comment.