Created
December 18, 2020 04:47
-
-
Save ishepherd/f06deae5c06ecc040de0662a865de5a7 to your computer and use it in GitHub Desktop.
Test that will repro EndOfStreamException
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Regression test: reads a parquet file containing a plain-dictionary data page
/// with no content, which used to make the reader throw EndOfStreamException.
/// The decoded data is then compared against ground truth obtained from Spark.
/// </summary>
[Fact]
public void Read_plain_dictionary_data_page_that_has_no_content()
{
    using (var reader = new ParquetReader(OpenTestFile("/special/data_page_without_content.parquet"), leaveStreamOpen: false))
    {
        // if the issue has not been fixed, this line will throw EndOfStreamException
        DataColumn[] columns = reader.ReadEntireRowGroup();

        // Compare to ground truth from spark
        // # PySpark commands to get the ground truth
        // # In my repro file, the exception is thrown when processing a column called '_text'
        // # so get some details for that column
        // import pyspark.sql.functions as F
        // df = spark.read.parquet('data_page_without_content.parquet')
        // len(df.columns)
        // df.count()
        // df.withColumn('_text_len', F.length('_text')).groupby('_text_len').count().select('_text_len', 'count').collect()

        // Check dimensions are correct: every column must report the same row count
        Assert.Equal(34, columns.Length);
        IEnumerable<int> rowCount = columns.Select(c => c.Data.Length).Distinct();
        Assert.Equal(new[] { 457 }, rowCount);

        // The "_text" column is where the EndOfStreamException used to be thrown
        // Check its values approximately, as a histogram of string lengths
        // (null strings are grouped under a null length)
        DataColumn textCol = columns.Single(c => c.Field.Name == "_text");

        // Materialize once: the histogram is inspected by several assertions below,
        // and a deferred query would re-run the grouping on every enumeration.
        var histogram = textCol.Data.Cast<string>()
            .GroupBy(s => s?.Length)
            .Select(grp => new { stringLength = grp.Key, count = grp.Count() })
            .ToList();

        Assert.Equal(4, histogram.Count);
        Assert.Contains(new { stringLength = (int?)null, count = 387 }, histogram);
        Assert.Contains(new { stringLength = (int?)3877, count = 1 }, histogram);
        Assert.Contains(new { stringLength = (int?)9245, count = 1 }, histogram);
        Assert.Contains(new { stringLength = (int?)17476, count = 68 }, histogram);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This gist belongs with this PR: aloneguid/parquet-dotnet#95
Quest staff:
The
data_page_without_content.parquet
is located in the MDS repo (as 'sqltempdbcontention_has_data_page_without_content.parquet'). It contains some of DS's implementation SQL, so we are not sharing it. See SCMM-3313.