Scenario: one module of a company project migrates data between data sources; one of its jobs is reading data out of MySQL and writing it into other stores. The architecture is Flume + Kafka, with Kafka acting as the Channel; custom Sources and Sinks move the data between the different stores. Throughput is reasonably good, and no data is lost.
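
For reference, swapping Kafka in as the channel only takes a few lines of agent configuration. A sketch with placeholder broker and topic names (the test configuration at the end of this post uses a memory channel instead):

a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = broker1:9092,broker2:9092
a1.channels.c1.kafka.topic = flume-channel
a1.channels.c1.parseAsFlumeEvent = true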

While writing up what I learned from the project, I decided to turn it into a blog post. Drawing on some training material, this post walks through a simple custom MySQL Source.

The main points to address:

1. Monitor data changes in a given MySQL table in near real time;

2. Query the data by offset and maxRecords (see the SQL sketch after this list);

3. Wrap each row in an event and write it to the Channel;

4. After a successful write, update the offset; roll back on failure (rollback is not discussed here; there is little to it on the Source side, and it is handled carefully in the Sink).
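
Point 2 in practice: with id as the offset, each poll reads one window of at most maxRow rows. The values below assume the max.row = 2000 setting from the example configuration later in this post:

SELECT * FROM student WHERE id > 0 AND id <= 2000;
-- after the offset is advanced, the next poll reads:
SELECT * FROM student WHERE id > 2000 AND id <= 4000;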

Maven dependencies:

<dependency>
    <groupId>org.apache.flume</groupId>
    <artifactId>flume-ng-core</artifactId>
    <version>1.9.0</version>
</dependency>

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.27</version>
</dependency>

Add the assembly plugin to the pom for packaging:

<build>
    <plugins>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <manifest>
                        <mainClass></mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

For convenience, the JDBC handling is factored into a MySQLSQLSourceHandler:

import org.apache.flume.Context;
import org.apache.flume.conf.ConfigurationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.sql.*;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

public class MySQLSQLSourceHandler {
    private static final Logger LOG = LoggerFactory.getLogger(MySQLSQLSourceHandler.class);
    private Integer runQueryDelay,   // interval between two queries, in ms
                    startFrom,       // initial id to start from
                    currentIndex,    // current id (the offset)
                    recordSize,      // number of rows returned by the last query
                    maxRow;          // maximum number of rows per query

    private String table,                   // table to read from
                   columnsToSelect,         // columns to select, supplied by the user
                   customQuery,             // custom query supplied by the user
                   query,                   // the query that is actually built and run
                   defaultCharsetResultSet; // result-set charset

    // Context, used to read the agent configuration
    private Context context;

    // Defaults for the parameters above; all can be overridden in the Flume job configuration
    private static final Integer DEFAULT_QUERY_DELAY = 10000;
    private static final Integer DEFAULT_START_VALUE = 0;
    private static final Integer DEFAULT_MAX_ROWS = 5000;
    private static final String DEFAULT_COLUMNS_SELECT = "*";
    private static final String DEFAULT_CHARSET_RESULTSET = "UTF-8";

    private static Connection conn = null;
    private static PreparedStatement ps = null;
    private static String connectionURL, connectionUserName, connectionPassword;

    // Load the JDBC properties and driver class from jdbc.properties on the classpath
    static{
        try {
            Properties prop = new Properties();
            prop.load(MySQLSQLSourceHandler
                    .class.getClassLoader()
                    .getResourceAsStream("jdbc.properties"));
            connectionURL = prop.getProperty("dbUrl");
            connectionUserName = prop.getProperty("dbUser");
            connectionPassword = prop.getProperty("dbPassword");
            Class.forName(prop.getProperty("dbDriver"));

        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    // Obtain a JDBC connection
    private static Connection InitConnection(String url,String user,String password){

        try {
            Connection conn = DriverManager.getConnection(url,user,password);
            if (conn == null)
                throw new SQLException();
            return conn;
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return null;
    }

    // Constructor
    MySQLSQLSourceHandler(Context context) throws ParseException {
        // Keep the context around
        this.context = context;

        // Parameters with defaults: read from the Flume job configuration, falling back to the defaults above
        this.columnsToSelect = context.getString("columns.to.select", DEFAULT_COLUMNS_SELECT);
        this.runQueryDelay = context.getInteger("run.query.delay", DEFAULT_QUERY_DELAY);
        this.startFrom = context.getInteger("start.from", DEFAULT_START_VALUE);
        this.maxRow = context.getInteger("max.row", DEFAULT_MAX_ROWS);
        this.defaultCharsetResultSet = context.getString("default.charset.resultset", DEFAULT_CHARSET_RESULTSET);


        // Mandatory parameters without defaults: read from the Flume job configuration
        this.table = context.getString("table");
        this.customQuery = context.getString("custom.query");
        connectionURL = context.getString("connection.url");
        connectionUserName = context.getString("connection.user");
        connectionPassword = context.getString("connection.password");
        conn = InitConnection(connectionURL, connectionUserName, connectionPassword);

        // Validate the configuration; throws if a mandatory parameter is missing
        checkMandatoryProperties();
        // Fetch the current id (offset)
        currentIndex = getStatusDBIndex(startFrom);
        // Build the query
        query = buildQuery();
    }

    // Validate the mandatory configuration (table and connection parameters)
    private void checkMandatoryProperties() {
        if (table == null) {
            throw new ConfigurationException("property table not set");
        }
        if (connectionURL == null) {
            throw new ConfigurationException("connection.url property not set");
        }
        if (connectionUserName == null) {
            throw new ConfigurationException("connection.user property not set");
        }
        if (connectionPassword == null) {
            throw new ConfigurationException("connection.password property not set");
        }
    }

    // Build the SQL statement
    private String buildQuery() {
        String sql = "";
        // Fetch the current id (offset)
        currentIndex = getStatusDBIndex(startFrom);
        LOG.info("currentIndex: {}", currentIndex);
        if (customQuery == null) {
            sql = "SELECT " + columnsToSelect + " FROM " + table;
        } else {
            sql = customQuery;
        }

        StringBuilder execSql = new StringBuilder(sql);
        // Use id as the offset (checked case-insensitively so WHERE also matches)
        if (!sql.toLowerCase().contains("where")) {
            execSql.append(" where ");
            execSql.append("id").append(">").append(currentIndex);
            execSql.append(" and id").append("<=").append(currentIndex + maxRow);
            LOG.info("execSql:" + execSql.toString());
            return execSql.toString();
        } else {
            // Cut off the existing where clause and append the offset window instead
            String oldSql = execSql.toString();
            int num = KMPFunction.evaluate(oldSql, "where");
            String noWhere = oldSql.substring(0, num);
            execSql = new StringBuilder(noWhere);
            execSql.append(" where ");
            execSql.append("id").append(">").append(currentIndex);
            execSql.append(" and id").append("<=").append(currentIndex + maxRow);
            LOG.info("execSql:" + execSql.toString());
            return execSql.toString();
        }
    }

    // Execute the query
    List<List<Object>> executeQuery() {
        try {
            // The SQL must be rebuilt on every call because the offset changes
            customQuery = buildQuery();
            // Collection holding the result rows
            List<List<Object>> results = new ArrayList<>();
            // Prepare a fresh statement for the rebuilt SQL; calling executeQuery(String)
            // on a PreparedStatement is illegal per the JDBC spec
            ps = conn.prepareStatement(customQuery);
            ResultSet result = ps.executeQuery();
            while (result.next()) {
                // One row (multiple columns)
                List<Object> row = new ArrayList<>();
                // Copy every column of the current row into the list
                for (int i = 1; i <= result.getMetaData().getColumnCount(); i++) {
                    row.add(result.getObject(i));
                }
                results.add(row);
            }
            LOG.info("execSql:" + customQuery + "\nresultSize:" + results.size());
            return results;
        } catch (SQLException e) {
            LOG.error(e.toString());
            // Reconnect and let the caller retry on the next poll
            conn = InitConnection(connectionURL, connectionUserName, connectionPassword);
        }
        return null;
    }

    // Flatten the result set into strings: each row is a list of columns, joined with commas
    List<String> getAllRows(List<List<Object>> queryResult) {
        List<String> allRows = new ArrayList<>();
        if (queryResult == null || queryResult.isEmpty())
            return allRows;
        StringBuilder row = new StringBuilder();
        for (List<Object> rawRow : queryResult) {
            for (Object column : rawRow) {
                if (column == null) {
                    row.append(",");
                } else {
                    row.append(column.toString()).append(",");
                }
            }
            allRows.add(row.toString());
            row = new StringBuilder();
        }
        return allRows;
    }

    // Persist the offset after every batch. Recording it allows the job to resume
    // where it left off after an interruption; the id column serves as the offset.
    void updateOffset2DB(int size) {
        // source_tab is the key: insert if absent, otherwise update (one row per source table)
        String sql = "insert into flume_meta(source_tab,currentIndex) VALUES('"
                + this.table
                + "','" + (currentIndex += size)
                + "') on DUPLICATE key update source_tab=values(source_tab),currentIndex=values(currentIndex)";
        LOG.info("updateStatus Sql:" + sql);

        execSql(sql);
    }

    // Execute a SQL statement
    private void execSql(String sql) {
        try {
            ps = conn.prepareStatement(sql);
            LOG.info("exec:" + sql);
            ps.execute();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    // Fetch the current offset (id)
    private Integer getStatusDBIndex(int startFrom) {
        // Look up the current id for this table in flume_meta
        String dbIndex = queryOne("select currentIndex from flume_meta where source_tab='" + table + "'");
        if (dbIndex != null) {
            return Integer.parseInt(dbIndex);
        }
        // No row yet: this is the first run, so fall back to the configured start value
        return startFrom;
    }

    // Run a query that returns a single value (the current id)
    private String queryOne(String sql) {
        ResultSet result = null;
        try {
            ps = conn.prepareStatement(sql);
            result = ps.executeQuery();
            while (result.next()) {
                return result.getString(1);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return null;
    }

    // Close JDBC resources
    void close() {
        try {
            ps.close();
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    int getCurrentIndex() {
        return currentIndex;
    }

    void setCurrentIndex(int newValue) {
        currentIndex = newValue;
    }

    int getRunQueryDelay() {
        return runQueryDelay;
    }

    String getQuery() {
        return query;
    }

    String getConnectionURL() {
        return connectionURL;
    }

    private boolean isCustomQuerySet() {
        return (customQuery != null);
    }

    Context getContext() {
        return context;
    }

    public String getConnectionUserName() {
        return connectionUserName;
    }

    public String getConnectionPassword() {
        return connectionPassword;
    }

    String getDefaultCharsetResultSet() {
        return defaultCharsetResultSet;
    }


}
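
The static block above reads a jdbc.properties file from the classpath. A minimal example (the key names come from the code; the values below are placeholders matching the test setup later in this post):

dbDriver=com.mysql.jdbc.Driver
dbUrl=jdbc:mysql://192.168.31.10:3306/mysqlsource
dbUser=root
dbPassword=root

Note that the constructor immediately overwrites the URL, user, and password with the connection.* values from the agent configuration, so in practice only dbDriver has to be correct here.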

To locate the where clause inside a custom SQL statement, a small substring-search helper (KMP) is used:

import org.junit.Test;

public class KMPFunction {
    @Test
    public void test() {
        System.out.println(evaluate("abdsdsedabdwedsdweabe", "sed"));
    }

    // Returns the index at which subStr first occurs in fullStr (fullStr is
    // lower-cased first; callers pass the pattern in lower case), or 0 if not found
    public static int evaluate(String fullStr, String subStr) {
        if (fullStr == null) {
            return 0;
        }
        fullStr = fullStr.toLowerCase();
        return kmp(fullStr, subStr, kmpNext(subStr));
    }

    private static int kmp(String source, String dest, int[] next) {
        int num = 0;
        for (int i = 0, j = 0; i < source.length(); i++) {
            while (j > 0 && source.charAt(i) != dest.charAt(j)) {
                j = next[j - 1];
            }
            if (source.charAt(i) == dest.charAt(j)) {
                j++;
            }
            if (j == dest.length()) {
                num = i - j + 1;
                break;
            }
        }
        return num;
    }

    /**
     * Builds the partial-match (failure) table for the pattern, i.e. the result
     * of matching the pattern against itself with KMP.
     * @param dest the pattern
     * @return the next table
     */
    private static int[] kmpNext(String dest) {
        int[] next = new int[dest.length()];
        next[0] = 0;
        for (int i = 1, j = 0; i < dest.length(); i++) {
            while (j > 0 && dest.charAt(i) != dest.charAt(j)) {
                j = next[j - 1];
            }
            if (dest.charAt(i) == dest.charAt(j)) {
                j++;
            }
            next[i] = j;
        }
        return next;
    }
    
}
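
For example, evaluate("select * from t WHERE id > 5", "where") returns 16, the index at which the (case-insensitive) match starts, which is exactly where buildQuery cuts the statement off before appending its own where clause. A case-insensitive indexOf would do the same job; KMP is used here mainly for practice.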

Next, the custom MySQLSource itself:

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.PollableSource;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.SimpleEvent;
import org.apache.flume.source.AbstractSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class MySQLSource extends AbstractSource implements Configurable, PollableSource {
    // Logger
    private static final Logger LOG = LoggerFactory.getLogger(MySQLSource.class);
    // JDBC helper that owns the connection, the query, and the offset bookkeeping
    private MySQLSQLSourceHandler sqlSourceHandler;

    @Override
    public void configure(Context context) {
        try {
            // Initialize the helper with the agent configuration
            sqlSourceHandler = new MySQLSQLSourceHandler(context);
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }


    @Override
    public Status process() throws EventDeliveryException {
        try {
            // Query the table
            List<List<Object>> result = sqlSourceHandler.executeQuery();
            // Events to deliver
            List<Event> events = new ArrayList<>();
            // Shared (empty) event headers
            HashMap<String, String> header = new HashMap<>();
            // If rows came back, wrap each one in an event
            // (executeQuery returns null after a SQLException, so guard against that too)
            if (result != null && !result.isEmpty()) {
                List<String> allRows = sqlSourceHandler.getAllRows(result);
                Event event = null;
                for (String row : allRows) {
                    event = new SimpleEvent();
                    event.setBody(row.getBytes());
                    event.setHeaders(header);
                    events.add(event);
                }
                // Write the batch to the Channel
                this.getChannelProcessor().processEventBatch(events);
                // Advance the offset by the number of rows read
                LOG.info("offset:" + result.size());
                sqlSourceHandler.updateOffset2DB(result.size());
            }
            // Wait before the next poll
            Thread.sleep(sqlSourceHandler.getRunQueryDelay());
            return Status.READY;
        } catch (InterruptedException e) {
            LOG.error("Error processing row", e);
            return Status.BACKOFF;
        }
    }

    @Override
    public synchronized void stop(){
        LOG.info("Stopping sql source {} ...",getName());
        try {
            // Release JDBC resources
            sqlSourceHandler.close();
        } finally {
          super.stop();
        }
    }


    @Override
    public long getBackOffSleepIncrement() {
        return 0;
    }

    @Override
    public long getMaxBackOffSleepInterval() {
        return 0;
    }


}

Package the project with mvn package and copy the resulting jar into $FLUME_HOME/lib.
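
A minimal sketch of that step, assuming $FLUME_HOME is set and the assembly produces a single jar-with-dependencies artifact:

mvn clean package
cp target/*-jar-with-dependencies.jar $FLUME_HOME/lib/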

Write MysqlSource.conf:

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = com.zyz.flume.code.MySQLSource
a1.sources.r1.connection.url = jdbc:mysql://192.168.31.10:3306/mysqlsource
a1.sources.r1.connection.user = root
a1.sources.r1.connection.password = root
a1.sources.r1.max.row = 2000
a1.sources.r1.table = student
a1.sources.r1.columns.to.select = *
#a1.sources.r1.incremental.column.name = id
#a1.sources.r1.incremental.value = 0
a1.sources.r1.run.query.delay=5000

# Describe the sink
a1.sinks.k1.type = logger

# Describe the channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

MySQL DDL and test data:

CREATE DATABASE mysqlsource;

CREATE TABLE `student` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `name` varchar(255) NOT NULL,
  PRIMARY KEY (`id`)
);

CREATE TABLE `flume_meta` (
  `source_tab` varchar(255) NOT NULL,
  `currentIndex` varchar(255) NOT NULL,
  PRIMARY KEY (`source_tab`)
);

insert into `student` (`id`, `name`) values('1','zhangsan');
insert into `student` (`id`, `name`) values('2','lisi');
insert into `student` (`id`, `name`) values('3','wangwu');
insert into `student` (`id`, `name`) values('4','zhaoliu');
insert into `student` (`id`, `name`) values('5','xiaoming');
insert into `student` (`id`, `name`) values('6','xiaoliang');
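
With the six rows above and start.from left at its default of 0, the first poll should read ids 1 through 6 and leave ('student', '6') in flume_meta; subsequent polls then query id > 6 and return nothing until new rows are inserted.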

A small helper script makes testing easier (it reports to Ganglia; if Ganglia is not set up, drop the last two -D lines):

#!/bin/bash


read -p "enter agent name: " AGENT_NAME
if [ -z $AGENT_NAME ];then
  echo "Error"
  exit
fi


read -p "enter job config file  name: " FILE_NAME
if [ -z $FILE_NAME ];then
  echo "Error"
  exit
fi



/home/hadoop/apps/flume-1.9.0/bin/flume-ng agent \
-c /home/hadoop/apps/flume-1.9.0/conf/ \
-n "$AGENT_NAME" \
-f /home/hadoop/apps/flume-1.9.0/job/"$FILE_NAME" \
-Dflume.root.logger=INFO,console \
-Dflume.monitoring.type=ganglia \
-Dflume.monitoring.hosts=192.168.31.10:8649

Run the script and check the output.
