Minor cleanups on varbinary to varchar coercion #24274

Draft · wants to merge 3 commits into master
@@ -264,7 +264,7 @@ public void encode(JsonGenerator generator, ConnectorSession session, Block bloc
return;
}
Slice slice = VARCHAR.getSlice(block, position);
generator.writeString(slice.toStringUtf8());
generator.writeUTF8String(slice.byteArray(), slice.byteArrayOffset(), slice.length());
}
}

@@ -287,7 +287,7 @@ public void encode(JsonGenerator generator, ConnectorSession session, Block bloc
return;
}
Slice slice = padSpaces(VARCHAR.getSlice(block, position), length);
generator.writeString(slice.toStringUtf8());
generator.writeUTF8String(slice.byteArray(), slice.byteArrayOffset(), slice.length());
}
}

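The switch from writeString to writeUTF8String above hands the UTF-8 bytes backing the Slice directly to Jackson instead of first materializing an intermediate java.lang.String and re-encoding it. A minimal standalone sketch of the new call (the harness class below is illustrative, not code from this PR):

    import com.fasterxml.jackson.core.JsonEncoding;
    import com.fasterxml.jackson.core.JsonFactory;
    import com.fasterxml.jackson.core.JsonGenerator;
    import io.airlift.slice.Slice;
    import io.airlift.slice.Slices;

    import java.io.ByteArrayOutputStream;
    import java.nio.charset.StandardCharsets;

    public class WriteUtf8StringSketch
    {
        public static void main(String[] args)
                throws Exception
        {
            Slice slice = Slices.utf8Slice("abc");
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            // writeUTF8String needs a byte-backed generator; writer-backed generators may not support it.
            try (JsonGenerator generator = new JsonFactory().createGenerator(out, JsonEncoding.UTF8)) {
                // Writes the slice's backing bytes as a JSON string value without building a String first.
                generator.writeUTF8String(slice.byteArray(), slice.byteArrayOffset(), slice.length());
            }
            System.out.println(out.toString(StandardCharsets.UTF_8)); // prints "abc" (with quotes)
        }
    }

Jackson still applies JSON escaping inside writeUTF8String, so for valid UTF-8 input the output matches writeString(slice.toStringUtf8()), minus the extra allocation and encode/decode round trip.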
@@ -26,7 +26,6 @@
import java.util.HexFormat;

import static io.trino.plugin.hive.HiveStorageFormat.ORC;
import static io.trino.plugin.hive.HiveStorageFormat.PARQUET;
import static io.trino.spi.type.VarbinaryType.VARBINARY;
import static io.trino.spi.type.Varchars.truncateToLength;
import static java.nio.charset.CodingErrorAction.REPLACE;
@@ -41,9 +40,6 @@ public static TypeCoercer<VarbinaryType, VarcharType> createVarbinaryToVarcharCo
if (storageFormat == ORC) {
return new OrcVarbinaryToVarcharCoercer(toType);
}
if (storageFormat == PARQUET) {
return new ParquetVarbinaryToVarcharCoercer(toType);
}
return new VarbinaryToVarcharCoercer(toType);
}

@@ -55,31 +51,6 @@ public VarbinaryToVarcharCoercer(VarcharType toType)
super(VARBINARY, toType);
}

@Override
protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position)
{
try {
Slice decodedValue = fromType.getSlice(block, position);
if (toType.isUnbounded()) {
toType.writeSlice(blockBuilder, decodedValue);
return;
}
toType.writeSlice(blockBuilder, truncateToLength(decodedValue, toType.getBoundedLength()));
}
catch (RuntimeException e) {
blockBuilder.appendNull();
}
}
}

private static class ParquetVarbinaryToVarcharCoercer
extends TypeCoercer<VarbinaryType, VarcharType>
{
public ParquetVarbinaryToVarcharCoercer(VarcharType toType)
{
super(VARBINARY, toType);
}

@Override
protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position)
{
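The removals above drop the Parquet-only branch and the old generic applyCoercedValue, so the retained method (formerly the Parquet coercer's, cut off in this view) now serves Parquet and the other non-ORC formats alike. Judging from the imports that survive (CodingErrorAction.REPLACE and truncateToLength), the shared behavior presumably decodes the binary as UTF-8 with malformed sequences replaced by U+FFFD and then truncates to the bounded VARCHAR length. A sketch of that decoding step, under that assumption (the helper class below is hypothetical, not the PR's code):

    import java.nio.ByteBuffer;
    import java.nio.charset.CharacterCodingException;
    import java.nio.charset.CharsetDecoder;
    import java.nio.charset.CodingErrorAction;
    import java.nio.charset.StandardCharsets;

    public final class Utf8ReplaceDecoding
    {
        private Utf8ReplaceDecoding() {}

        // Decodes arbitrary bytes as UTF-8, substituting U+FFFD for malformed sequences instead of failing.
        public static String decodeWithReplacement(byte[] bytes)
        {
            CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
            try {
                return decoder.decode(ByteBuffer.wrap(bytes)).toString();
            }
            catch (CharacterCodingException e) {
                // Unreachable: with REPLACE configured the decoder never reports an error.
                throw new AssertionError(e);
            }
        }
    }

With one shared implementation, Parquet no longer needs a dedicated coercer, and the old test that pinned the previous raw pass-through behavior (exercised via RCTEXT in the next file) is removed along with it.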
@@ -23,7 +23,6 @@

import static io.trino.plugin.hive.HiveStorageFormat.ORC;
import static io.trino.plugin.hive.HiveStorageFormat.PARQUET;
import static io.trino.plugin.hive.HiveStorageFormat.RCTEXT;
import static io.trino.plugin.hive.HiveTimestampPrecision.DEFAULT_PRECISION;
import static io.trino.plugin.hive.coercions.CoercionUtils.createCoercer;
import static io.trino.plugin.hive.util.HiveTypeTranslator.toHiveType;
@@ -45,35 +44,6 @@ public class TestVarbinaryToVarcharCoercer

@Test
public void testVarbinaryToVarcharCoercion()
{
assertVarbinaryToVarcharCoercion(Slices.utf8Slice("abc"), VARBINARY, Slices.utf8Slice("abc"), VARCHAR);
assertVarbinaryToVarcharCoercion(Slices.utf8Slice("abc"), VARBINARY, Slices.utf8Slice("ab"), createVarcharType(2));
// Invalid UTF-8 encoding
assertVarbinaryToVarcharCoercion(Slices.wrappedBuffer(X_CHAR, CONTINUATION_BYTE), VARBINARY, Slices.wrappedBuffer(X_CHAR, CONTINUATION_BYTE), VARCHAR);
assertVarbinaryToVarcharCoercion(
Slices.wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE),
VARBINARY,
Slices.wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE),
VARCHAR);
assertVarbinaryToVarcharCoercion(
Slices.wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR),
VARBINARY,
Slices.wrappedBuffer(X_CHAR, START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE, X_CHAR),
VARCHAR);
assertVarbinaryToVarcharCoercion(
Slices.wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xA0, (byte) 0x80),
VARBINARY,
Slices.wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xA0, (byte) 0x80),
VARCHAR);
assertVarbinaryToVarcharCoercion(
Slices.wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xBF, (byte) 0xBF),
VARBINARY,
Slices.wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xBF, (byte) 0xBF),
VARCHAR);
}

@Test
public void testVarbinaryToVarcharCoercionForParquet()
{
assertVarbinaryToVarcharCoercionForParquet(Slices.utf8Slice("abc"), VARBINARY, "abc", VARCHAR);
assertVarbinaryToVarcharCoercionForParquet(Slices.utf8Slice("abc"), VARBINARY, "ab", createVarcharType(2));
@@ -98,11 +68,6 @@ public void testVarbinaryToVarcharCoercionForOrc()
assertVarbinaryToVarcharCoercionForOrc(Slices.wrappedBuffer(X_CHAR, (byte) 0b11101101, (byte) 0xBF, (byte) 0xBF), VARBINARY, "58 ed bf bf", VARCHAR);
}

private static void assertVarbinaryToVarcharCoercion(Slice actualValue, Type fromType, Slice expectedValue, Type toType)
{
assertVarbinaryToVarcharCoercion(actualValue, fromType, expectedValue, toType, RCTEXT);
}

private static void assertVarbinaryToVarcharCoercionForOrc(Slice actualValue, Type fromType, String expectedValue, Type toType)
{
assertVarbinaryToVarcharCoercion(actualValue, fromType, Slices.utf8Slice(expectedValue), toType, ORC);
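For context on the expected strings in these tests (my reading of the assertions, not text from the PR): the ORC path renders a value containing invalid UTF-8 as a space-delimited lowercase hex dump of all its bytes, which is what java.util.HexFormat (imported in the coercer file above) produces, while the Parquet path substitutes U+FFFD for the malformed sequences. For example:

    import java.util.HexFormat;

    public class OrcHexExpectationSketch
    {
        public static void main(String[] args)
        {
            // 0x58 is 'X'; ED BF BF encodes a UTF-16 surrogate code point, which strict UTF-8 decoders reject.
            byte[] bytes = {0x58, (byte) 0xED, (byte) 0xBF, (byte) 0xBF};

            // Prints "58 ed bf bf", matching the ORC expectation asserted above.
            System.out.println(HexFormat.ofDelimiter(" ").formatHex(bytes));
        }
    }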
@@ -211,35 +211,38 @@ protected void doTestHiveCoercion(HiveTableDefinition tableDefinition)
}
String prestoSelectQuery = format("SELECT %s FROM %s", String.join(", ", prestoReadColumns), tableName);
assertQueryResults(Engine.TRINO, prestoSelectQuery, expectedPrestoResults, prestoReadColumns, 2);
List<Object> hexRepresentedValue = ImmutableList.of("58F7BFBFBF", "58F7BFBFBF58");

if (tableName.toLowerCase(ENGLISH).contains("orc")) {
hexRepresentedValue = ImmutableList.of("3538206637206266206266206266", "3538206637206266206266206266203538");
}
// Additional assertions for VARBINARY coercion
if (prestoReadColumns.contains("binary_to_string")) {
List<Object> hexRepresentedValue = ImmutableList.of("58EFBFBDEFBFBDEFBFBDEFBFBD", "58EFBFBDEFBFBDEFBFBDEFBFBD58");

// TODO: Translate internal byte sequence of Varchar in sync with Hive when coercing from Varbinary column
if (tableName.toLowerCase(ENGLISH).contains("parquet") && !tableName.toLowerCase(ENGLISH).contains("_unpartitioned")) {
hexRepresentedValue = ImmutableList.of("58EFBFBDEFBFBDEFBFBDEFBFBD", "58EFBFBDEFBFBDEFBFBDEFBFBD58");
}
if (tableName.toLowerCase(ENGLISH).contains("orc")) {
hexRepresentedValue = ImmutableList.of("3538206637206266206266206266", "3538206637206266206266206266203538");
}

// Additional assertions for VARBINARY coercion
assertQueryResults(
Engine.TRINO,
format("SELECT to_hex(cast(binary_to_string as varbinary)) as hex_representation FROM %s", tableName),
ImmutableMap.of("hex_representation", hexRepresentedValue),
ImmutableList.of("hex_representation"),
2);
assertQueryResults(
Engine.TRINO,
format("SELECT to_hex(cast(binary_to_string as varbinary)) as hex_representation FROM %s", tableName),
ImmutableMap.of("hex_representation", hexRepresentedValue),
ImmutableList.of("hex_representation"),
2);
}

// For Hive, remove unsupported columns for the current file format and hive version
List<String> hiveReadColumns = removeUnsupportedColumnsForHive(allColumns, tableName);
Map<String, List<Object>> expectedHiveResults = expected.apply(Engine.HIVE);
String hiveSelectQuery = format("SELECT %s FROM %s", String.join(", ", hiveReadColumns), tableName);
assertQueryResults(Engine.HIVE, hiveSelectQuery, expectedHiveResults, hiveReadColumns, 2);

List<Object> hexRepresentedValue = ImmutableList.of("58F7BFBFBF", "58F7BFBFBF58");

// TODO: Translate internal byte sequence of Varchar in sync with Hive when coercing from Varbinary column
if (tableName.toLowerCase(ENGLISH).contains("parquet")) {
hexRepresentedValue = ImmutableList.of("58EFBFBDEFBFBDEFBFBDEFBFBD", "58EFBFBDEFBFBDEFBFBDEFBFBD58");
}
else if (tableName.toLowerCase(ENGLISH).contains("orc")) {
hexRepresentedValue = ImmutableList.of("3538206637206266206266206266", "3538206637206266206266206266203538");
}

// Additional assertions for VARBINARY coercion
assertQueryResults(
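The hex_representation constants in this hunk decode as follows (a worked check, not wording from the PR): 58F7BFBFBF is 'X' followed by F7 BF BF BF, an invalid UTF-8 sequence, i.e. the raw bytes surfacing unchanged; 58EFBFBDEFBFBDEFBFBDEFBFBD is 'X' followed by four U+FFFD replacement characters (EF BF BD each), i.e. the replacement-decoded form; and 3538206637206266206266206266 is the UTF-8 text "58 f7 bf bf bf", i.e. the ORC-style hex dump, hex-encoded once more by the outer to_hex(cast(binary_to_string as varbinary)). A small sketch confirming the middle value:

    import java.nio.charset.StandardCharsets;
    import java.util.HexFormat;

    public class HexRepresentationSketch
    {
        public static void main(String[] args)
        {
            // 'X' followed by F7 BF BF BF, which is not valid UTF-8.
            byte[] raw = HexFormat.of().parseHex("58F7BFBFBF");

            // Replacement decoding turns each rejected byte into U+FFFD (EF BF BD when re-encoded as UTF-8).
            String replaced = new String(raw, StandardCharsets.UTF_8);

            // Prints 58EFBFBDEFBFBDEFBFBDEFBFBD, the expectation used for the coerced value above.
            System.out.println(HexFormat.of().withUpperCase().formatHex(replaced.getBytes(StandardCharsets.UTF_8)));
        }
    }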
@@ -300,6 +300,8 @@ protected Map<ColumnContext, String> expectedExceptionsWithTrinoContext()
.put(columnContext("parquet", "timestamp_map_to_map"), "Unsupported Trino column type (varchar) for Parquet column ([timestamp_map_to_map, key_value, value, timestamp2string] optional int96 timestamp2string)")
.put(columnContext("parquet", "string_to_timestamp"), "Unsupported Trino column type (timestamp(3)) for Parquet column ([string_to_timestamp] optional binary string_to_timestamp (STRING))")
.put(columnContext("parquet", "timestamp_to_date"), "Unsupported Trino column type (date) for Parquet column ([timestamp_to_date] optional int96 timestamp_to_date))")
.put(columnContext("parquet", "binary_to_string"), "Varbinary to Varchar coercion doesn't match with Hive's implementation") // Error message is not matched, this is added for our understanding
.put(columnContext("parquet", "binary_to_smaller_varchar"), "Varbinary to Varchar coercion doesn't match with Hive's implementation") // Error message is not matched, this is added for our understanding
.buildOrThrow();
}
