看SQL的执行日志的时候经常会出现这样一条日志如下图,今天来结合源码分析一下:

CLICKHOUSE的镜像仓库 clickhouse prewhere_CLICKHOUSE的镜像仓库

 

1、PREWHERE 关键字介绍

PREWHERE Clause

This clause has the same meaning as the WHERE clause. The difference is in which data is read from the table. When using PREWHERE, first only the columns necessary for executing PREWHERE are read. Then the other columns are read that are needed for running the query, but only those blocks where the PREWHERE expression is true.

It makes sense to use PREWHERE if there are filtration conditions that are used by a minority of the columns in the query, but that provide strong data filtration. This reduces the volume of data to read.

For example, it is useful to write PREWHERE for queries that extract a large number of columns, but that only have filtration for a few columns.

PREWHERE is only supported by tables from the *MergeTree family.

A query may simultaneously specify PREWHERE and WHERE. In this case, PREWHERE precedes WHERE.

If the 'optimize_move_to_prewhere' setting is set to 1 and PREWHERE is omitted, the system uses heuristics to automatically move parts of expressions from WHERE to PREWHERE.

2、源码分析

2.1 前期优化条件判断

源头方法在这里,只有在允许优化(optimize_move_to_prewhere=1) 且 where条件存在 且 prewhere条件不存在 且 query.final()=false 时会进行优化

auto optimize_prewhere = [&](auto &merge_tree) {
                SelectQueryInfo query_info;
                query_info.query = query_ptr;
                query_info.syntax_analyzer_result = syntax_analyzer_result;
                query_info.sets = query_analyzer->getPreparedSets();

                /// Try transferring some condition from WHERE to PREWHERE if enabled and viable
                // 允许优化 且 where条件存在 且 prewhere条件不存在 且 query.final()=false 时会进行优化
                // query.final()=false可能指table_expression为空或modifier=false
                if (settings.optimize_move_to_prewhere && query.where() && !query.prewhere() && !query.final())
                    MergeTreeWhereOptimizer{query_info, context, merge_tree, query_analyzer->getRequiredSourceColumns(), log};
            };

            if (const MergeTreeData *merge_tree_data = dynamic_cast<const MergeTreeData *>(storage.get()))
                optimize_prewhere(*merge_tree_data);

2.2 中期准备

MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
            SelectQueryInfo &query_info,
            const Context &context,
            const MergeTreeData &data,
            const Names &queried_columns,
            Logger *log)
            : table_columns{ext::map<std::unordered_set>(data.getColumns().getAllPhysical(),
                                                         [](const NameAndTypePair &col) { return col.name; })},
              queried_columns{queried_columns},
              block_with_constants{
                      KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result,
                                                          context)},
              log{log} {
        if (!data.primary_key_columns.empty())
            first_primary_key_column = data.primary_key_columns[0];

        calculateColumnSizes(data, queried_columns);
        determineArrayJoinedNames(query_info.query->as<ASTSelectQuery &>());
        //具体的移动优化操作
        optimize(query_info.query->as<ASTSelectQuery &>());
    }

 

2.3 最终优化

void MergeTreeWhereOptimizer::optimize(ASTSelectQuery &select) const {
        if (!select.where() || select.prewhere())
            return;

        Conditions where_conditions = analyze(select.where());//分析where()条件, 生成std::list<Condition>这个对象
        Conditions prewhere_conditions;

        UInt64 total_size_of_moved_conditions = 0;//移动的condition的size()

        // depend on the same set of columns. columns完全相同时才做移动优化操作???
        // 这里应该是定义了一个move_condition(), 供后面调用
        /// Move condition and all other conditions depend on the same set of columns.
        auto move_condition = [&](Conditions::iterator cond_it) {
            prewhere_conditions.splice(prewhere_conditions.end(), where_conditions, cond_it);
            total_size_of_moved_conditions += cond_it->columns_size;

            /// Move all other conditions that depend on the same set of columns.
            for (auto jt = where_conditions.begin(); jt != where_conditions.end();) {
                if (jt->columns_size == cond_it->columns_size && jt->identifiers == cond_it->identifiers)
                    prewhere_conditions.splice(prewhere_conditions.end(), where_conditions, jt++);
                else
                    ++jt;
            }
        };

        /// Move conditions unless the ratio of total_size_of_moved_conditions to the total_size_of_queried_columns is less than some threshold.
        while (!where_conditions.empty()) {
            auto it = std::min_element(where_conditions.begin(), where_conditions.end());//这个最小值是依据什么判断的???好像是按字典序排列的, 具体看Condition

            /// Move the best condition to PREWHERE if it is viable.
            if (!it->viable)//只有viable=true的才可能移动, 否则就跳出循环了. 因为上面取的是std::min_element, 这时候如果viable=false, 说明list中的其他元素都是viable=false
                break;

            /// 10% ratio is just a guess.
            //最多移动总条件个数的10%, 否则移动太多过滤效果也不会很好. 所以一般也就移动一个
            if (total_size_of_moved_conditions > 0 &&
                (total_size_of_moved_conditions + it->columns_size) * 10 > total_size_of_queried_columns)
                break;

            move_condition(it);//调用上面的方法执行移动操作
        }

        /// Nothing was moved.
        if (prewhere_conditions.empty())
            return;

        /// Rewrite the SELECT query.

        select.setExpression(ASTSelectQuery::Expression::WHERE, reconstruct(where_conditions));
        select.setExpression(ASTSelectQuery::Expression::PREWHERE, reconstruct(prewhere_conditions));

        LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition \"" << select.prewhere() << "\" moved to PREWHERE");
    }

注意:

需要看一下Condition的结构:

struct Condition {
            ASTPtr node;
            UInt64 columns_size = 0;
            NameSet identifiers;
            bool viable = false;
            bool good = false;

            auto tuple() const {
                return std::make_tuple(!viable, !good, columns_size);//注意这里定的是!viable, !good, 所以前面的方法中要取std::min_element
            }

            /// Is condition a better candidate for moving to PREWHERE?
            // 按字典顺序比较 tuple 中的值, 先比较tuple中的第一个元素, 相同的话比较第二个......
            //false --- 0 ; true --- 1;
            bool operator<(const Condition &rhs) const {
                return tuple() < rhs.tuple();  //如果有两个元素根据前面的规则不能判断大小, 则根据加入到list中的顺序比较, 后加入的较小
            }
        };

其中重写了小于号<操作符,即给出了Condition之间比较的规则。

columns_size是指根据列名name得到该列数据的大小(压缩后的数据的大小),为了读取更少的数据(更好的过滤效果),所以前面取得是Conditions中根据一定的规则比较后最小的Condition。

 

在最终优化阶段,“Conditions where_conditions = analyze(select.where());//分析where()条件, 生成std::list<Condition>这个对象”这个analyze()方法又是比较重要的方法,其具体实现是:

//分析where子句, 看哪些条件可以前移到prewhere子句中
    void MergeTreeWhereOptimizer::analyzeImpl(Conditions &res, const ASTPtr &node) const {
        if (const auto *func_and = node->as<ASTFunction>(); func_and && func_and->name == "and") {
            for (const auto &elem : func_and->arguments->children)
                analyzeImpl(res, elem);
        } else {
            Condition cond;
            cond.node = node;

            collectIdentifiersNoSubqueries(node, cond.identifiers);

            cond.viable =
                    /// Condition depend on some column. Constant expressions are not moved. 不移动常量表达式
                    !cond.identifiers.empty() //identifiers不能为空
                    && !cannotBeMoved(node) //不允许ARRAY JOIN, GLOBAL IN, GLOBAL NOT IN, indexHint等这些表达式 和 result of ARRAY JOIN 的列 移动到prewhere中
                    /// Do not take into consideration the conditions consisting only of the first primary key column
                    && !hasPrimaryKeyAtoms(node)// 不移动仅由一个主键列组成的表达式
                    /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded.
                    && isSubsetOfTableColumns(cond.identifiers)// 只考虑移动表列, 不移动数组连接的列. 注意: 我们假设别名已扩展
                    /// Do not move conditions involving all queried columns.
                    && cond.identifiers.size() < queried_columns.size();//不要移动涉及所有查询列的条件

            if (cond.viable) {
                cond.columns_size = getIdentifiersColumnSize(cond.identifiers);
                cond.good = isConditionGood(node);
            }

            res.emplace_back(std::move(cond));
        }
    }

其中,“cond.good = isConditionGood(node); ”这个方法中涉及到一个阈值,默认threshold = 2;(注意: 即使返回false, 也还是可能会进行prewhere优化, 具体还要看后面的判断)

//算术操作符function->name != "equals"时, 直接返回false
    //取值不是UInt64、Int64和Float64三种类型, 直接返回false
    //其他情况将 取值 和 threshold=2 进行比较
    /// 注意: 即使返回false, 也还是可能会进行prewhere优化, 具体还要看后面的判断
    bool MergeTreeWhereOptimizer::isConditionGood(const ASTPtr &condition) const {
        const auto *function = condition->as<ASTFunction>();
        if (!function)
            return false;

        /** we are only considering conditions of form `equals(one, another)` or `one = another`, especially if either `one` or `another` is ASTIdentifier */
        // 只考虑取 等于equals 的情况
        if (function->name != "equals")
            return false;

        auto left_arg = function->arguments->children.front().get();//等号左边的参数: Identifier
        auto right_arg = function->arguments->children.back().get();//等号右边的参数: Literal

        /// try to ensure left_arg points to ASTIdentifier
        // 这一步是为了确保left_arg指向ASTIdentifier
        if (!left_arg->as<ASTIdentifier>() && right_arg->as<ASTIdentifier>())
            std::swap(left_arg, right_arg);

        if (left_arg->as<ASTIdentifier>()) {
            /// condition may be "good" if only right_arg is a constant and its value is outside the threshold
            // 如果只有right_arg是常量且该常量值超出threshold, 则condition may be "good"
            if (const auto *literal = right_arg->as<ASTLiteral>()) {
                const auto &field = literal->value;
                const auto type = field.getType();

                /// check the value with respect to threshold
                // 注意这里只比较取值是 UInt64、Int64和Float64三种类型的情况, 比较该值与threshold的关系
                // 取值是String的时候肯定返回false
                if (type == Field::Types::UInt64) {//对于UInt64类型, value > threshold, 则认为condition is "good"
                    const auto value = field.get<UInt64>();
                    return value > threshold;
                } else if (type == Field::Types::Int64) {//对于Int64类型, |value| > threshold, 则认为condition is "good"
                    const auto value = field.get<Int64>();
                    return value < -threshold || threshold < value;
                } else if (type == Field::Types::Float64) {//对于Float64类型, value <> threshold, 则认为condition is "good"
                    const auto value = field.get<Float64>();
                    return value < threshold || threshold < value;
                }
            }
        }

        return false;
    }