Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 47 additions & 1 deletion ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.PTFOperator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
Expand Down Expand Up @@ -1300,9 +1302,10 @@ private static void runTopNKeyOptimization(OptimizeTezProcContext procCtx)
return;
}

String topNKeyRegexPattern = buildTopNKeyRegexPattern(procCtx);
Map<SemanticRule, SemanticNodeProcessor> opRules = new LinkedHashMap<SemanticRule, SemanticNodeProcessor>();
opRules.put(
new RuleRegExp("Top n key optimization", ReduceSinkOperator.getOperatorName() + "%"),
new RuleRegExp("Top n key optimization", topNKeyRegexPattern),
new TopNKeyProcessor(
HiveConf.getIntVar(procCtx.conf, HiveConf.ConfVars.HIVE_MAX_TOPN_ALLOWED),
HiveConf.getFloatVar(procCtx.conf, ConfVars.HIVE_TOPN_EFFICIENCY_THRESHOLD),
Expand All @@ -1322,6 +1325,49 @@ private static void runTopNKeyOptimization(OptimizeTezProcContext procCtx)
ogw.startWalking(topNodes, null);
}

/*
* Build the ReduceSink matching pattern used by TopNKey optimization.
*
* For ORDER BY / LIMIT queries that do not involve GROUP BY or JOIN,
* applying TopNKey results in a performance regression. ReduceSink
* operators created only for ordering must therefore be excluded from
* TopNKey.
*
* When ORDER BY or LIMIT is present, restrict TopNKey to ReduceSink
* operators that originate from GROUP BY, JOIN, MAPJOIN, LATERAL VIEW
* JOIN or PTF query shapes
*/
private static String buildTopNKeyRegexPattern(OptimizeTezProcContext procCtx) {
String reduceSinkOp = ReduceSinkOperator.getOperatorName() + "%";

boolean hasOrderOrLimit =
procCtx.parseContext.getQueryProperties().hasLimit() ||
procCtx.parseContext.getQueryProperties().hasOrderBy();
Comment on lines +1343 to +1345
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this? It is usually better if we can keep the optimization/transformation rules independent of the SQL syntax.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is added for windowing queries to use topNkey Path - without group by / join in the query.
example: windowing_streaming.q
select * from ( select p_mfgr, rank() over(partition by p_mfgr order by p_name) r from part) a where r < 4


if (hasPTFReduceSink(procCtx) || !hasOrderOrLimit) {
return reduceSinkOp;
}

return "("
+ GroupByOperator.getOperatorName() + "|"
+ PTFOperator.getOperatorName() + "|"
+ JoinOperator.getOperatorName() + "|"
+ MapJoinOperator.getOperatorName() + "|"
+ LateralViewJoinOperator.getOperatorName() + "|"
+ CommonMergeJoinOperator.getOperatorName()
+ ").*%"
+ reduceSinkOp;
}

private static boolean hasPTFReduceSink(OptimizeTezProcContext ctx) {
for (ReduceSinkOperator rs : ctx.visitedReduceSinks) {
if (rs.getConf().isPTFReduceSink()) {
return true;
}
}
return false;
}

Comment on lines +1362 to +1370
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this achieves the desired outcome. Basically, if there is a PTF RS anywhere in the plan we will apply the rule on every RS (no matter if it is PTF or not).

Moreover, by relying on ctx.visitedReduceSinks we make the TopNKeyOptimization highly dependent on stats dependent optimization which is not great.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For windowing queries, since there is not much performance issues with TopNKey enabled, currently making the queries to use TopNkey Path. But to match the plan, there is no sequence of PTF%RS% patterns for some queries. only RS% will work for this case.
I chosed this approach, to avoid traversing the tree to check query has PTF operator.
can you suggest a solution for the windowing queries

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it a better solution to traverse the tree to find PTF ?

private boolean findParallelSemiJoinBranch(Operator<?> mapjoin, TableScanOperator bigTableTS,
ParseContext parseContext,
Map<ReduceSinkOperator, TableScanOperator> semijoins,
Expand Down
39 changes: 17 additions & 22 deletions ql/src/test/results/clientpositive/llap/autoColumnStats_4.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -74,22 +74,17 @@ STAGE PLANS:
Filter Operator
predicate: cint is not null (type: boolean)
Statistics: Num rows: 9173 Data size: 671202 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: cint (type: int)
null sort order: z
Statistics: Num rows: 9173 Data size: 671202 Basic stats: COMPLETE Column stats: COMPLETE
top n: 10
Select Operator
expressions: cint (type: int), CAST( cstring1 AS varchar(128)) (type: varchar(128))
outputColumnNames: _col0, _col1
Statistics: Num rows: 9173 Data size: 977184 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: int)
null sort order: z
sort order: +
Statistics: Num rows: 9173 Data size: 977184 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: varchar(128))
Select Operator
expressions: cint (type: int), CAST( cstring1 AS varchar(128)) (type: varchar(128))
outputColumnNames: _col0, _col1
Statistics: Num rows: 9173 Data size: 1479384 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: int)
null sort order: z
sort order: +
Statistics: Num rows: 9173 Data size: 1479384 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
value expressions: _col1 (type: varchar(128))
Execution mode: vectorized, llap
LLAP IO: all inputs
Reducer 2
Expand All @@ -98,27 +93,27 @@ STAGE PLANS:
Select Operator
expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: varchar(128))
outputColumnNames: _col0, _col1
Statistics: Num rows: 9173 Data size: 977184 Basic stats: COMPLETE Column stats: COMPLETE
Statistics: Num rows: 9173 Data size: 1479384 Basic stats: COMPLETE Column stats: COMPLETE
Limit
Number of rows: 10
Statistics: Num rows: 10 Data size: 1296 Basic stats: COMPLETE Column stats: COMPLETE
Statistics: Num rows: 10 Data size: 1728 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: int)
null sort order: a
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 10 Data size: 1296 Basic stats: COMPLETE Column stats: COMPLETE
Statistics: Num rows: 10 Data size: 1728 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: varchar(128))
Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: varchar(128))
outputColumnNames: _col0, _col1
Statistics: Num rows: 10 Data size: 1296 Basic stats: COMPLETE Column stats: COMPLETE
Statistics: Num rows: 10 Data size: 1728 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
Statistics: Num rows: 10 Data size: 1296 Basic stats: COMPLETE Column stats: COMPLETE
Statistics: Num rows: 10 Data size: 1728 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
Expand All @@ -128,7 +123,7 @@ STAGE PLANS:
Select Operator
expressions: _col0 (type: int), _col1 (type: varchar(128))
outputColumnNames: a, b
Statistics: Num rows: 10 Data size: 1296 Basic stats: COMPLETE Column stats: COMPLETE
Statistics: Num rows: 10 Data size: 1728 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: min(a), max(a), count(1), count(a), compute_bit_vector_hll(a), max(length(b)), avg(COALESCE(length(b),0)), count(b), compute_bit_vector_hll(b)
minReductionHashAggr: 0.9
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,17 +242,12 @@ STAGE PLANS:
expressions: key (type: string)
outputColumnNames: _col0
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: _col0 (type: string)
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 20 Data size: 1740 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 20 Data size: 1740 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
Execution mode: vectorized, llap
LLAP IO: all inputs
Map 4
Expand All @@ -264,17 +259,12 @@ STAGE PLANS:
expressions: key (type: string)
outputColumnNames: _col0
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: _col0 (type: string)
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 20 Data size: 1740 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 20 Data size: 1740 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
Execution mode: vectorized, llap
LLAP IO: all inputs
Reducer 3
Expand Down Expand Up @@ -719,26 +709,22 @@ STAGE PLANS:
TableScan
alias: a
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: key (type: string)
null sort order: z
Select Operator
expressions: key (type: string)
outputColumnNames: _col0
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Select Operator
expressions: key (type: string)
outputColumnNames: _col0
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
Execution mode: vectorized, llap
LLAP IO: all inputs
Reducer 2
Expand All @@ -751,17 +737,12 @@ STAGE PLANS:
Limit
Number of rows: 5
Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: _col0 (type: string)
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
Reducer 4
Execution mode: vectorized, llap
Reduce Operator Tree:
Expand Down Expand Up @@ -789,17 +770,12 @@ STAGE PLANS:
Limit
Number of rows: 5
Statistics: Num rows: 5 Data size: 435 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: _col0 (type: string)
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 10 Data size: 870 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
Union 3
Vertex: Union 3

Expand Down
65 changes: 25 additions & 40 deletions ql/src/test/results/clientpositive/llap/cbo_input26.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -37,22 +37,17 @@ STAGE PLANS:
alias: a
filterExpr: ((ds = '2008-04-08') and (hr = '11')) (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: key (type: string)
null sort order: z
Select Operator
expressions: key (type: string), value (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Select Operator
expressions: key (type: string), value (type: string)
outputColumnNames: _col0, _col1
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: string)
TopN Hash Memory Usage: 0.1
value expressions: _col1 (type: string)
Execution mode: vectorized, llap
LLAP IO: all inputs
Map 4
Expand Down Expand Up @@ -196,21 +191,16 @@ STAGE PLANS:
alias: a
filterExpr: ((ds = '2008-04-08') and (hr = '11')) (type: boolean)
Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: key (type: string)
null sort order: z
Select Operator
expressions: key (type: string)
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Select Operator
expressions: key (type: string)
outputColumnNames: _col0
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
Execution mode: vectorized, llap
LLAP IO: all inputs
Map 4
Expand Down Expand Up @@ -354,21 +344,16 @@ STAGE PLANS:
alias: a
filterExpr: ((ds = '2008-04-08') and (hr = '11')) (type: boolean)
Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
Top N Key Operator
sort order: +
keys: key (type: string)
null sort order: z
Select Operator
expressions: key (type: string)
outputColumnNames: _col0
Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Select Operator
expressions: key (type: string)
outputColumnNames: _col0
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
TopN Hash Memory Usage: 0.1
Execution mode: vectorized, llap
LLAP IO: all inputs
Map 4
Expand Down
Loading