介绍
将MySQL数据库中的冷数据备份并上传至云平台对象存储的过程。冷数据是指数据库中的历史或不经常访问的数据。我们首先通过执行SQL查询语句从MySQL数据库中提取所需数据,然后将其保存为CSV文件格式,接着通过SDK将备份文件上传到对象存储。
查询
-- 查询最早的数据
SELECT *
FROM bos_balance_flow limit 5
-- 查询最后10条数据
SELECT *
FROM bos_balance_flow
ORDER BY created_at DESC
LIMIT 10;
添加索引
给时间字段加上索引提高查询速度
-- 给订单归档表加时间字段加索引
CREATE INDEX idx_created_at ON bos_order_archive(created_at);
历史数据上传s3
# 后台执行数据备份脚本
nohup python3 db-upload-mongo-s3.py &
# 一次性上传历史mysql数据到s3
import logging
from logging.handlers import RotatingFileHandler
import os
from datetime import datetime, timedelta
import pandas as pd
import pymysql
import boto3
import time
# 配置日志
logger = logging.getLogger('sql_logger')
logger.setLevel(logging.INFO)
# 配置RotatingFileHandler,设置日志文件路径、文件大小和文件数量
handler = RotatingFileHandler('/tmp/sql.log', maxBytes=50 * 1024 * 1024, backupCount=2)
# 设置日志格式
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# 添加handler到logger对象
logger.addHandler(handler)
# 添加日志输出到终端
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
# AWS S3 配置
AWS_ACCESS_KEY = "AKIASEGBIssss"
AWS_SECRET_KEY = "xxxxxxxx"
S3_BUCKET_NAME = "pro-s3-db"
S3_DIRECTORY = "mongo_bos_server"
# 数据库连接配置
DB_HOST = "172.16.99.99"
DB_USER = "root"
DB_PASSWORD = "xxxxxxxx"
DB_NAME = "bos_server"
# 日期格式
DATE_FORMAT = "%Y-%m-%d"
# 获取每个表的最早和最后日期
table_dates = {}
with pymysql.connect(host=DB_HOST, user=DB_USER, password=DB_PASSWORD, database=DB_NAME) as connection:
# 将需要处理的表添加到列表中
for table in ["bos_order_archive", "bos_order_future"]:
# 查询每个表中最早和最后的日期
min_date_query = f"SELECT MIN(created_at) FROM {table}"
max_date_query = f"SELECT MAX(created_at) FROM {table}"
min_date_result = pd.read_sql_query(min_date_query, connection)
max_date_result = pd.read_sql_query(max_date_query, connection)
# 转换日期格式
min_date = min_date_result.iloc[0, 0].strftime(DATE_FORMAT)
max_date = max_date_result.iloc[0, 0].strftime(DATE_FORMAT)
table_dates[table] = {"min_date": min_date, "max_date": max_date}
# 遍历每个表的日期范围
for table, dates in table_dates.items():
current_date = datetime.strptime(dates["min_date"], DATE_FORMAT)
end_date = datetime.strptime(dates["max_date"], DATE_FORMAT)
while current_date <= end_date:
current_date_str = current_date.strftime(DATE_FORMAT)
# 构建 SQL 查询语句
sql_query = f"SELECT * FROM {table} WHERE created_at >= '{current_date_str} 00:00:00.000' AND created_at < '{current_date_str} 23:59:59.999'"
# 使用 with 语句连接数据库
with pymysql.connect(host=DB_HOST, user=DB_USER, password=DB_PASSWORD, database=DB_NAME) as connection:
# 使用 pandas 读取数据库数据
df = pd.read_sql_query(sql_query, connection)
# 如果数据不为空则上传到S3
if not df.empty:
# 生成 CSV 文件名
csv_filename = f"{table}_{current_date_str}.csv"
# 保存数据到 CSV 文件
df.to_csv(csv_filename, index=False)
# 获取文件大小
file_size = os.path.getsize(csv_filename)
file_size_mb = file_size / (1024 * 1024) # Convert bytes to megabytes
# 构建S3目标路径
s3_object_key = f"{S3_DIRECTORY}/{csv_filename}"
# 使用 boto3 上传文件至 S3
s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)
s3_client.upload_file(csv_filename, S3_BUCKET_NAME, s3_object_key)
# 记录日志
logger.info(f"文件 {csv_filename} 已上传到 S3 存储桶 {S3_BUCKET_NAME} 目录 {S3_DIRECTORY},文件大小: {file_size_mb:.2f} MB,上传成功")
# 等待5秒
time.sleep(5)
# 增加一天
current_date += timedelta(days=1)
每日数据上传s3
从一个数据库中获取前一天的数据。将数据存储到一个 CSV 文件中。 检查本地是否已存在该 CSV 文件,如果存在则不执行数据库查询,直接将已有文件上传到 Amazon S3 存储桶中。
import os
import pandas as pd
import boto3
from datetime import datetime, timedelta
import requests
import pymysql
def send_notification(title, content):
message = {
"msg_type": "text",
"content": {
"text": f"{title}\n{content}"
}
}
lark_headers = {"Content-Type": "application/json"}
webhook_url = "https://open.larksuite.com/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
response = requests.post(webhook_url, json=message, headers=lark_headers)
if response.status_code == 200:
print("Notification sent successfully.")
else:
print(f"Failed to send notification. Status code: {response.status_code}, Response text: {response.text}")
#print(response.text)
class S3Uploader:
def __init__(self, aws_access_key, aws_secret_key, s3_bucket_name):
self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key)
self.s3_bucket_name = s3_bucket_name
def file_exists_in_s3(self, s3_object_key):
try:
self.s3_client.head_object(Bucket=self.s3_bucket_name, Key=s3_object_key)
return True
except:
return False
def upload_to_s3(self, local_file_path, s3_object_key):
self.s3_client.upload_file(local_file_path, self.s3_bucket_name, s3_object_key)
print(f"File {local_file_path} uploaded to S3 bucket {self.s3_bucket_name} with key {s3_object_key}")
def main():
# 日期格式
DATE_FORMAT = "%Y-%m-%d"
# AWS S3 配置
AWS_ACCESS_KEY = "AKIASEGBxxxxx"
AWS_SECRET_KEY = "Myk97UInMQXXXX"
S3_BUCKET_NAME = "pro-s3-db"
# 数据库连接配置
DB_HOST = "pro-rds-xxxx-slave.xxxx.com"
DB_USER = "dwh_XXXX"
DB_PASSWORD = "XXXXXXX"
DB_NAME = "bos_x"
# 凌晨5点触发脚本,计算前一天的日期
yesterday = datetime.now() - timedelta(days=1)
yesterday_str = yesterday.strftime(DATE_FORMAT)
# 构建当月目录
current_month_directory = yesterday.strftime("%Y.%m")
# 生成 CSV 文件名,包含当月目录
#csv_filename = f"dwh_balance_flow_infos_{yesterday_str}.csv"
csv_filename = f"bos_balance_flow_{yesterday_str}.csv"
# 检查本地文件是否已存在
if os.path.exists(csv_filename):
print(f"File {csv_filename} already exists locally. Skipping database query")
# 初始化 S3 上传器
s3_uploader = S3Uploader(AWS_ACCESS_KEY, AWS_SECRET_KEY, S3_BUCKET_NAME)
# 构建 S3 对象键
s3_object_key = f"{current_month_directory}/{csv_filename}"
# 检查文件是否已存在于 S3 中
if s3_uploader.file_exists_in_s3(s3_object_key):
print(f"File {csv_filename} already exists in S3. Skipping upload.")
# 发送通知到Lark
file_size = os.path.getsize(csv_filename)
file_size_mb = file_size / (1024 * 1024) # Convert bytes to megabytes
message = f"S3路径:/{s3_object_key} {file_size_mb:.2f} MB 检测已存在"
send_notification("只读库bos_balance_flow备份成功:", message)
else:
# 上传文件至 S3
s3_uploader.upload_to_s3(csv_filename, s3_object_key)
else:
# 使用 with 语句连接数据库
with pymysql.connect(host=DB_HOST, user=DB_USER, password=DB_PASSWORD, database=DB_NAME) as connection:
# 使用 pandas 读取数据库数据
sql_query = f"SELECT * FROM bos_balance_flow WHERE created_at >= '{yesterday_str} 00:00:00.000' AND created_at < '{yesterday_str} 23:59:59.999'"
#sql_query = f"SELECT * FROM dwh_balance_flow_infos WHERE created_at >= '{yesterday_str} 00:00:00.000' AND created_at < '{yesterday_str} 23:59:59.999'"
df = pd.read_sql_query(sql_query, connection)
# 保存数据到 CSV 文件
df.to_csv(csv_filename, index=False)
# 初始化 S3 上传器
s3_uploader = S3Uploader(AWS_ACCESS_KEY, AWS_SECRET_KEY, S3_BUCKET_NAME)
# 构建 S3 对象键
s3_object_key = f"{current_month_directory}/{csv_filename}"
# 检查文件是否已存在于 S3 中
if s3_uploader.file_exists_in_s3(s3_object_key):
print(f"File {csv_filename} already exists in S3. Skipping upload !!")
# 发送通知到Lark
file_size = os.path.getsize(csv_filename)
file_size_mb = file_size / (1024 * 1024) # Convert bytes to megabytes
message = f"S3路径:/{s3_object_key} {file_size_mb:.2f} MB 检测已存在"
send_notification("只读库bos_balance_flow备份成功:", message)
else:
# 上传文件至 S3
s3_uploader.upload_to_s3(csv_filename, s3_object_key)
# 发送通知到Lark
file_size = os.path.getsize(csv_filename)
file_size_mb = file_size / (1024 * 1024) # Convert bytes to megabytes
message = f"S3路径:/{s3_object_key} {file_size_mb:.2f} MB"
send_notification("只读库bos_balance_flow备份成功:", message)
if __name__ == "__main__":
main()