1.设计图
pdf解析可实现指定页码先进行解析,然后调换解析顺序
2.处理流程图
3.数据库、redis设计
3.代码实现
pdf解析工具使用pdfium,仓库地址:https://github.com/klippa-app/pdfium-cli
3.1.client_pdfium.go(底层代码)
package pdf_util
import (
"fmt"
"os/exec"
"path"
"strconv"
"support/logger"
)
// pdfium holds the settings of a pdfium command-line client.
type pdfium struct {
	// parseTool is the pdfium executable, as returned by getPdfiumTool.
	parseTool string
	// dpi is the render resolution in its decimal string form.
	dpi string
	// maxHeight and maxWidth are left at zero by newPdfium.
	maxHeight, maxWidth int
}
// newPdfium creates a pdfium client for the given render resolution.
// The executable is looked up through getPdfiumTool and the dpi is stored as
// a decimal string.
func newPdfium(dpi int) *pdfium {
	client := new(pdfium)
	client.parseTool = getPdfiumTool()
	client.dpi = strconv.Itoa(dpi)
	logger.Debug("new pdfium, dpi: %d, programName: %s", dpi, client.parseTool)
	return client
}
// parse renders pages firstPage..lastPage of the PDF at filePath into JPEG
// files below imgDir by running the pdfium CLI through the shell.
//
// The input path and the output pattern are handed to the shell as positional
// parameters ("$0" and "$1") instead of being spliced into the command
// string, so paths containing spaces or shell metacharacters reach pdfium
// verbatim and cannot alter the command.
//
// It returns the combined stdout/stderr of the command and the execution
// error, if any.
func (p *pdfium) parse(log logger.ILog, filePath string, imgDir string, firstPage, lastPage int, dpi int) ([]byte, error) {
	script := fmt.Sprintf(`%s render "$0" --dpi %d --pages %d-%d "$1"`,
		getPdfiumTool(), dpi, firstPage, lastPage)
	// Arguments following the -c script are exposed to it as $0, $1, ...
	cmd := exec.Command(getBashTool(), "-c", script, filePath, path.Join(imgDir, "%d.jpg"))
	log.Debug("pdfium cmd is: %s", cmd)
	res, err := cmd.CombinedOutput()
	log.Debug("pdfium exec result:%s", res)
	return res, err
}
3.2 业务代码
dto.go
package pdf_parse_v2
import (
"encoding/json"
"github.com/gin-gonic/gin"
"path"
"pps/config"
"pps/dao"
pfc "pps/helper/pdf_cache"
"strconv"
"support/collection/_set"
"support/util"
"support/web/mw"
"time"
)
const (
	// metaFileName is the name of the meta file kept in the image directory.
	metaFileName = "meta.json"
	// maxTaskExpireParam is the factor applied to a task's estimated duration
	// when its expire time is computed (see getExpireTime).
	maxTaskExpireParam = 5
)
// requestParam holds the parameters of a parse request together with the OSS
// paths that init derives from them.
type requestParam struct {
// FileId identifies the file; binding marks it as required.
FileId string `json:"fileId" form:"fileId" binding:"required"`
// FileOssPath is the OSS path of the PDF; binding marks it as required.
FileOssPath string `json:"fileOssPath" form:"fileOssPath" binding:"required"`
// PrePage is the number of pages to pre-parse; preParse substitutes the
// configured default when it is 0.
PrePage int `json:"prePage" form:"prePage"`
// imgOssPath is the OSS image directory, filled in by init.
imgOssPath string
// metaFileKey is the OSS key of the meta file, filled in by init.
metaFileKey string
// DisablePicCompress turns off image compression after upload.
DisablePicCompress bool `json:"disablePicCompress" form:"disablePicCompress"`
}
// requestJumpParam holds the parameters of a jump-to-page request.
type requestJumpParam struct {
// FileId identifies the file; binding marks it as required.
FileId string `json:"fileId" form:"fileId" binding:"required"`
// PageNo is the page the caller wants parsed first; binding marks it as
// required.
PageNo int `json:"pageNo" form:"pageNo" binding:"required"`
}
// init fills the derived fields: the OSS image directory and the key of the
// meta file inside that directory.
func (p *requestParam) init() {
	imgDir := getImageOssPath(p.FileOssPath)
	p.imgOssPath = imgDir
	p.metaFileKey = path.Join(imgDir, metaFileName)
}
// getImageOssPath returns the OSS image directory of a PDF, which is the PDF's
// OSS path with the "_i" suffix appended.
func getImageOssPath(fileOssPath string) string {
	const imgDirSuffix = "_i"
	return fileOssPath + imgDirSuffix
}
// getMetaFileKey returns the OSS key of the meta file that belongs to the PDF
// stored at fileOssPath.
func getMetaFileKey(fileOssPath string) string {
	imgDir := getImageOssPath(fileOssPath)
	return path.Join(imgDir, metaFileName)
}
// ParseResult is the payload returned to the caller and stored as the meta
// file once pre-parsing has finished.
type ParseResult struct {
// TotalPage is the number of pages that will be parsed.
TotalPage int `json:"totalPage"`
// CoverWidth is the width of the rendered cover image.
CoverWidth float64 `json:"coverWidth"`
// CoverHeight is the height of the rendered cover image.
CoverHeight float64 `json:"coverHeight"`
// ParseStatus is the parse state (one of the dao status values).
ParseStatus int `json:"parseStatus"`
}
// String renders the result as JSON text via util.ConvertToJsonStr.
func (r *ParseResult) String() string {
	text := util.ConvertToJsonStr(r)
	return text
}
// parseMsg bundles everything one parse task needs: file locations, document
// and cover dimensions, render settings and the originating request.
type parseMsg struct {
	// Local and remote locations of the PDF and of its rendered images.
	localPdfPath, remotePdfPath     string
	localImgDirPrefix, remoteImgDir string

	totalPage int
	// Width and height of the PDF document.
	width, height float64

	metaFileKey string
	reqTime     time.Time
	ctx         *gin.Context

	prePage, imageDpi int
	// Width and height of the rendered cover image.
	imageWidth, imageHeight float64
	// disablePicCompress disables image compression when true.
	disablePicCompress bool

	fileId, fileOssPath string
}
// toParseMsg assembles a parseMsg from the cached PDF info, the request
// parameters and the request context. The render dpi comes from the config.
func toParseMsg(pdfInfo *pfc.PdfInfo, param *requestParam, ctx *gin.Context) *parseMsg {
	msg := new(parseMsg)

	// Values taken from the request.
	msg.localImgDirPrefix = getLocalImgDirPrefix(param.FileId)
	msg.remoteImgDir = param.imgOssPath
	msg.metaFileKey = param.metaFileKey
	msg.prePage = param.PrePage
	msg.disablePicCompress = param.DisablePicCompress
	msg.fileId = param.FileId
	msg.fileOssPath = param.FileOssPath

	// Values taken from the PDF info.
	msg.localPdfPath = pdfInfo.LocalPath
	msg.remotePdfPath = pdfInfo.RemotePath
	msg.totalPage = pdfInfo.TotalPage
	msg.width = pdfInfo.Width
	msg.height = pdfInfo.Height

	msg.ctx = ctx
	msg.imageDpi = config.Config.ImageDpi
	return msg
}
// getLocalImgDir returns the local working directory for the task that starts
// at firstPage.
func (m *parseMsg) getLocalImgDir(firstPage int) string {
	prefix := m.localImgDirPrefix
	return getLocalImageDir(prefix, firstPage)
}
// getCost returns the cost reported by mw.GetCost for the request context
// stored in the message.
func (m *parseMsg) getCost() time.Duration {
	elapsed := mw.GetCost(m.ctx)
	return elapsed
}
// buildPdfParseRecord creates the database record that tracks the remaining
// parse work of a file: the pending task start pages, the task size and its
// estimated duration and expire time. The record starts in the parsing state
// with firstNum as the page currently being parsed.
func buildPdfParseRecord(msg *parseMsg, result *ParseResult,
	taskPage int, taskEstimateMs int64) *dao.TblPdfParseRecord {
	pendingJSON := util.ConvertToJsonStr(calPendingList(msg.prePage, taskPage, result.TotalPage))

	record := &dao.TblPdfParseRecord{
		PrePage:   msg.prePage,
		TotalPage: result.TotalPage,
		Pending:   pendingJSON,
		Parsing:   firstNum,
	}
	record.BasePdfParseRecord = dao.BasePdfParseRecord{
		Status:             dao.StParsing,
		FileId:             msg.fileId,
		FileOssPath:        msg.fileOssPath,
		Dpi:                msg.imageDpi,
		Tool:               config.Config.ParseTool,
		Width:              result.CoverWidth,
		Height:             result.CoverHeight,
		TaskPage:           taskPage,
		TaskEstimateMs:     taskEstimateMs,
		ExpireTime:         getExpireTime(taskEstimateMs),
		RetryCount:         0,
		DisablePicCompress: msg.disablePicCompress,
	}
	return record
}
// calPendingList returns the first page of every parse task that follows the
// pre-parsed pages: prePage+1, prePage+1+taskPage, ... up to totalPage.
//
// A taskPage below 1 is treated as 1; without that clamp the loop would never
// advance and would append forever. The result is never nil, so it encodes as
// "[]" rather than "null" in JSON.
func calPendingList(prePage, taskPage, totalPage int) []int {
	if taskPage < 1 {
		taskPage = 1
	}
	size := 0
	if totalPage > prePage {
		size = (totalPage-prePage-1)/taskPage + 1
	}
	pending := make([]int, 0, size)
	for start := prePage + 1; start <= totalPage; start += taskPage {
		pending = append(pending, start)
	}
	return pending
}
func getExpireTime(taskEstimateMs int64) int64 {
return util.NowMs() + maxTaskExpireParam*taskEstimateMs
}
// getLocalImgDirPrefix returns <BaseDir>/img/<fileId>, the local directory
// below which the per-task image directories of a file are created.
func getLocalImgDirPrefix(fileId string) string {
	baseDir := config.Config.BaseDir
	return path.Join(baseDir, "img", fileId)
}
// getLocalImageDir returns the task directory below localImgDirPrefix; the
// directory is named after the task's first page.
func getLocalImageDir(localImgDirPrefix string, firstPage int) string {
	dirName := strconv.FormatInt(int64(firstPage), 10)
	return path.Join(localImgDirPrefix, dirName)
}
// getImageOssCompressDir returns the OSS location of the compressed variant,
// which is imageOssPath with the "_z" suffix appended.
func getImageOssCompressDir(imageOssPath string) string {
	const compressSuffix = "_z"
	return imageOssPath + compressSuffix
}
// JumpHis is the jump history of a file as cached in redis: the paging
// parameters of the parse record plus the task start pages already handled.
type JumpHis struct {
// PrePage is the number of pre-parsed pages.
PrePage int `json:"prePage"`
// TotalPage is the total number of pages.
TotalPage int `json:"totalPage"`
// TaskPage is the number of pages per parse task.
TaskPage int `json:"taskPage"`
// History lists the task start pages that are parsed or being parsed.
History []int `json:"history"`
}
// String renders the jump history as JSON text via util.ConvertToJsonStr.
func (j *JumpHis) String() string {
	text := util.ConvertToJsonStr(j)
	return text
}
// buildJumpHis derives the jump history of a file from its parse record.
// A task start page counts as handled when it is no longer in the record's
// pending list; those pages are merged with the previously cached history.
func buildJumpHis(record *dao.TblPdfParseRecord, his []int) *JumpHis {
	var stillPending []int
	// A decode failure is ignored and leaves the pending list empty.
	_ = json.Unmarshal([]byte(record.Pending), &stillPending)

	allStarts := _set.NewBySlice(calPendingList(record.PrePage, record.TaskPage, record.TotalPage))
	// Task start pages that have already left the pending list.
	handled := _set.Difference(allStarts, _set.NewBySlice(stillPending)).Slice()
	merged := _set.Union(_set.NewBySlice(his), _set.NewBySlice(handled))

	return &JumpHis{
		PrePage:   record.PrePage,
		TotalPage: record.TotalPage,
		TaskPage:  record.TaskPage,
		History:   merged.Slice(),
	}
}
// baseRecord2ParseMsg rebuilds a parseMsg from a stored parse record and the
// cached PDF info. totalPage, prePage and ctx are left at their zero values.
func baseRecord2ParseMsg(pdfInfo *pfc.PdfInfo, record dao.BasePdfParseRecord) *parseMsg {
	msg := new(parseMsg)

	// Paths derived from the record.
	msg.localImgDirPrefix = getLocalImgDirPrefix(record.FileId)
	msg.remoteImgDir = getImageOssPath(record.FileOssPath)
	msg.metaFileKey = getMetaFileKey(record.FileOssPath)

	// PDF locations from the cache.
	msg.localPdfPath = pdfInfo.LocalPath
	msg.remotePdfPath = pdfInfo.RemotePath

	// Settings copied from the record.
	msg.width = record.Width
	msg.height = record.Height
	msg.imageDpi = record.Dpi
	msg.disablePicCompress = record.DisablePicCompress
	msg.fileId = record.FileId
	msg.fileOssPath = record.FileOssPath
	return msg
}
deal_parse.go
package pdf_parse_v2
import (
"github.com/gin-gonic/gin"
"github.com/pkg/errors"
math2 "math"
"os"
"path"
"pps/application/common"
"pps/application/compress"
util2 "pps/application/util"
"pps/config"
"pps/dao"
"pps/helper/img_util"
pfc "pps/helper/pdf_cache"
"pps/helper/pdf_util"
"strings"
"support/concurrent"
"support/http_util"
"support/logger"
"support/math"
"support/oss"
"support/safe"
"support/util"
db2 "support/web/db"
"support/web/mw"
)
const (
	// firstNum is the number of the first page.
	firstNum = 1
	// BigImageDpi is the dpi used for the retry after a failed render.
	BigImageDpi = 96
	// BestParseTime is the target duration of one parse task: 15s, in
	// milliseconds.
	BestParseTime = 15000
)
func DealParse(c *gin.Context) {
common.CallBegin()
defer common.CallEnd()
log := mw.GetLogger(c)
// 参数校验
var param requestParam
if err := c.ShouldBindQuery(¶m); err != nil {
log.Error("DealParse request param failed as: %s", err)
mw.RetFail(c, mw.ErrBadParam)
return
}
param.init()
// 检查是否解析过
if pr := getParseResult(log, ¶m); pr != nil {
log.Warn("skip parse as already parsed, meta: %s", pr)
mw.RetJSON(c, pr)
return
}
// 获取pdf信息(文件本地路径,页数,宽高)
pdfInfo, err := pfc.GetPdf(log, param.FileOssPath)
if err != nil {
log.Error("GetPdf failed as: %s", err)
mw.RetFail(c, mw.ErrInner)
return
}
log.Info("get pdf info finished, cost: %s, pdfInfo: %s", mw.GetCost(c), pdfInfo)
// 最多解析1000页
maxParseCount := config.Config.TotalParseCount
if pdfInfo.TotalPage > maxParseCount {
log.Error("pdf totalPage more than maxParseCount, totalPage %d, maxParseCount: %d",
pdfInfo.TotalPage, maxParseCount)
pdfInfo.TotalPage = maxParseCount
}
msg := toParseMsg(pdfInfo, ¶m, c)
// 预解析
pr, httpErr := preParse(c, msg)
if httpErr != nil {
mw.RetFail(c, httpErr)
return
}
// 启动协程,继续解析剩下的
go safe.Safego(func() {
leftParse(c.Copy(), msg)
}, "leftParse")
// 返回
mw.RetJSON(c, pr)
return
}
// getParseResult returns the stored parse result when the file has been
// parsed before, which is detected by the meta file in the image directory.
// It returns nil when the meta file is missing or cannot be downloaded.
func getParseResult(log logger.ILog, param *requestParam) *ParseResult {
	exists, _ := oss.Helper.IsObjectExist(param.metaFileKey)
	if !exists {
		return nil
	}
	meta := new(ParseResult)
	if err := util2.DownloadData(param.metaFileKey, meta); err != nil {
		log.Error("error while download meta file: %s", err)
		return nil
	}
	return meta
}
// preParse renders and uploads the leading pages, stores the meta file, and
// saves the parse record that drives the remaining tasks.
//
// The task size is chosen so that one task takes about BestParseTime, based
// on the average per-page cost measured here. That cost is clamped to at
// least 1ms: a fast pre-parse would otherwise truncate to 0 and the division
// below would panic. A non-positive prePage falls back to the configured
// default, and the divisor is kept at 1 or more.
//
// It returns the parse result and, on failure, the HTTP error to report. When
// only saving the record fails, the result is returned together with
// mw.ErrDb.
func preParse(c *gin.Context, msg *parseMsg) (r *ParseResult, he *http_util.HttpError) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	if msg.prePage <= 0 {
		msg.prePage = config.Config.PreParseCount
	}
	lastPage := math.Min(msg.prePage, msg.totalPage)
	if err := parseAndUpload(c, msg, firstNum, lastPage); err != nil {
		log.Error("pre parse failed while parse as: %s", err)
		return nil, mw.ErrInner
	}
	log.Info("pre parse upload image finished")

	parseStatus := util.If(msg.totalPage <= msg.prePage, dao.StParseSuccess, dao.StParsing)
	pr := &ParseResult{
		TotalPage:   msg.totalPage,
		CoverWidth:  msg.imageWidth,
		CoverHeight: msg.imageHeight,
		ParseStatus: parseStatus,
	}
	log.Info("pre parse set size finished, result: %s", pr)

	// Upload the meta file.
	if err := util2.UploadData(msg.metaFileKey, pr); err != nil {
		log.Error("pre parse failed while upload meta file as: %s", err)
		return nil, mw.ErrInner
	}

	// Average cost per page, used to size the remaining tasks.
	preParseCost := msg.getCost()
	log.Info("pre parse upload meta file finished cost: %s", preParseCost)
	pages := int64(msg.prePage)
	if pages < 1 {
		pages = 1
	}
	avgPageCost := preParseCost.Milliseconds() / pages
	if avgPageCost < 1 {
		avgPageCost = 1
	}
	taskPage := math.Max(config.Config.PreParseCount, int(BestParseTime/avgPageCost))
	taskEstimateMs := int64(taskPage) * avgPageCost

	e := buildPdfParseRecord(msg, pr, taskPage, taskEstimateMs)
	if err := dao.SavePdfParseRecord(db, e); err != nil {
		log.Error("SavePdfParseRecord failed as: %s", err)
		return pr, mw.ErrDb
	}
	return pr, nil
}
// parseAndUpload renders pages firstPage..lastPage into a local working
// directory, uploads the images to OSS and, unless disabled, compresses them.
// The working directory is removed before the function returns.
func parseAndUpload(c *gin.Context, msg *parseMsg, firstPage, lastPage int) error {
	log := mw.GetLogger(c)
	workDir := msg.getLocalImgDir(firstPage)
	if mkErr := os.MkdirAll(workDir, os.ModePerm); mkErr != nil {
		return errors.Wrap(mkErr, "mkdir")
	}
	defer func() {
		if rmErr := os.RemoveAll(workDir); rmErr != nil {
			log.Error("remove dir failed as: %s", rmErr)
		}
	}()

	// Render the pages, then let checkImageParseErr decide whether a second
	// attempt is needed.
	out, parseErr := pdf_util.ParsePdf(log, msg.localPdfPath, workDir,
		firstPage, lastPage, msg.imageDpi)
	out, parseErr = checkImageParseErr(log, out, parseErr, msg, firstPage, lastPage, workDir)
	if parseErr != nil {
		return errors.Wrap(parseErr, "parse pdf")
	}
	log.Info("parse pdf finished, firstPage: %d, lastPage: %d, res: \n%s",
		firstPage, lastPage, wrapOssPath(string(out), msg.remotePdfPath))

	// The task containing the cover also checks the cover size and may lower
	// the dpi.
	if firstPage == firstNum {
		if dpiErr := checkDpi(c, msg, firstPage, lastPage); dpiErr != nil {
			return errors.Wrap(dpiErr, "check dpi")
		}
	}

	if upErr := uploadImage(workDir, msg.remoteImgDir, firstPage, lastPage); upErr != nil {
		return errors.Wrap(upErr, "upload image")
	}

	// Compression can be switched off per request.
	if msg.disablePicCompress {
		return nil
	}
	if zipErr := compressImage(c, msg.remoteImgDir, firstPage, lastPage); zipErr != nil {
		return errors.Wrap(zipErr, "compressImage")
	}
	return nil
}
// uploadImage uploads the images of pages firstPage..lastPage from
// localImgDir to remoteImgDir with at most 5 concurrent uploads.
//
// Errors are collected through the limiter (RunError / FirstError), the same
// way compressImage does it, instead of being written to a shared variable
// from several goroutines at once.
func uploadImage(localImgDir, remoteImgDir string, firstPage, lastPage int) error {
	limit := concurrent.NewGoLimit(5)
	for pageNo := firstPage; pageNo <= lastPage; pageNo++ {
		// Declared per iteration so each closure captures its own paths.
		imgName := pdf_util.GetImgName(pageNo)
		localImgPath := path.Join(localImgDir, imgName)
		remoteImgPath := path.Join(remoteImgDir, imgName)
		limit.RunError(func() error {
			return oss.Helper.PutFile(remoteImgPath, localImgPath, oss.AclPublicRead)
		})
	}
	limit.Wait()
	return limit.FirstError()
}
// compressImage compresses the uploaded images of pages firstPage..lastPage
// below remoteImgDir with a concurrency limit of 10 and returns the limiter's
// first error.
func compressImage(c *gin.Context, remoteImgDir string, firstPage, lastPage int) error {
	pool := concurrent.NewGoLimit(10)
	for page := firstPage; page <= lastPage; page++ {
		target := path.Join(remoteImgDir, pdf_util.GetImgName(page))
		pool.RunError(func() error {
			return compress.CompressImage(c, target)
		})
	}
	pool.Wait()
	return pool.FirstError()
}
// checkDpi measures the rendered cover image and records its size in msg.
// When the cover exceeds the configured maximum width, height or area, the
// dpi in msg is scaled down, the pages are rendered again, and the cover size
// is measured once more.
func checkDpi(c *gin.Context, msg *parseMsg, firstPage, lastPage int) error {
	workDir := msg.getLocalImgDir(firstPage)
	log := mw.GetLogger(c)
	coverPath := path.Join(workDir, pdf_util.GetImgName(firstNum))

	cover, err := img_util.GetLocalImageDim(coverPath)
	if err != nil {
		return errors.Wrap(err, "get local image")
	}
	// Record the cover size.
	msg.imageWidth = float64(cover.Width)
	msg.imageHeight = float64(cover.Height)

	limits := config.Config.ImgAttrMaxValue
	maxWidth, maxHeight, maxSize := limits[0], limits[1], limits[2]
	if cover.Width == 0 || cover.Height == 0 {
		return errors.New("get imageDim failed Width or Height is 0")
	}

	// Ratios of the cover to each limit; the area ratio is square-rooted so it
	// scales like the linear ones.
	widthRatio := float64(cover.Width) / maxWidth
	heightRatio := float64(cover.Height) / maxHeight
	areaRatio := math2.Sqrt(float64(cover.Width*cover.Height) / maxSize)
	worst := math.Max(widthRatio, heightRatio, areaRatio)
	if !(worst > 1.0) {
		// Within the limits: keep the current dpi.
		return nil
	}

	// The cover is too large: lower the dpi and render again.
	msg.imageDpi = int(float64(msg.imageDpi) / worst)
	log.Error("checkDpi image is too large,dpi change to %d", msg.imageDpi)
	out, err := pdf_util.ParsePdf(log, msg.localPdfPath, workDir,
		firstPage, lastPage, msg.imageDpi)
	if err != nil {
		return errors.Wrap(err, "parse pdf")
	}
	log.Info("parse pdf finished, firstPage: %d, lastPage: %d, res: \n%s",
		firstPage, lastPage, wrapOssPath(string(out), msg.remotePdfPath))

	// Measure the cover produced with the lowered dpi.
	resized, err := img_util.GetLocalImageDim(coverPath)
	if err != nil {
		return errors.Wrap(err, "get local image")
	}
	msg.imageWidth = float64(resized.Width)
	msg.imageHeight = float64(resized.Height)
	return nil
}
// wrapOssPath appends " (<ossPath>)" to every line of res. CRLF line endings
// are normalized to LF, and leading and trailing spaces and newlines of the
// whole text are trimmed first.
func wrapOssPath(res string, ossPath string) string {
	suffix := " (" + ossPath + ")"
	text := strings.Trim(strings.ReplaceAll(res, "\r\n", "\n"), " \n")

	var sb strings.Builder
	for idx, line := range strings.Split(text, "\n") {
		if idx > 0 {
			sb.WriteByte('\n')
		}
		sb.WriteString(line)
		sb.WriteString(suffix)
	}
	return sb.String()
}
// checkImageParseErr inspects the outcome of a render and retries once at
// BigImageDpi in two cases: the output contains the pdfium big-image error
// text, or the error text contains "killed". In every other case res and err
// are returned unchanged.
func checkImageParseErr(log logger.ILog, res []byte, err error, msg *parseMsg, firstPage, lastPage int, localImgDir string) ([]byte, error) {
	var notice string
	switch {
	case strings.Contains(string(res), pdf_util.PdfiumBigImageError.Error()):
		notice = "image is too large,dpi change to %d"
	case err != nil && strings.Contains(err.Error(), "killed"):
		notice = "pdf parse is failed as killed by signal,dpi change to %d"
	default:
		return res, err
	}

	// Lower the dpi and render again.
	msg.imageDpi = BigImageDpi
	log.Error(notice, msg.imageDpi)
	return pdf_util.ParsePdf(log, msg.localPdfPath, localImgDir,
		firstPage, lastPage, msg.imageDpi)
}
deal_parse_left.go
package pdf_parse_v2
import (
"encoding/json"
"github.com/gin-gonic/gin"
"github.com/pkg/errors"
"os"
"path"
"pps/dao"
"pps/helper/img_util"
"pps/helper/pdf_util"
"support/math"
"support/oss"
"support/util"
db2 "support/web/db"
"support/web/mw"
)
// leftParse works through the pending task list stored in the parse record of
// msg.fileId. In every round it reloads the record, takes the first pending
// start page, persists the shortened list, and parses that task. When the
// list is empty it marks the record as successfully parsed and stops.
//
// A task that fails to parse is logged and replaced by placeholder images, so
// the loop carries on with the next task.
func leftParse(c *gin.Context, msg *parseMsg) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	for {
		record, err := dao.GetPdfParseRecordByFileId(db, msg.fileId)
		if err != nil {
			log.Error("GetPdfParseRecordByFileId failed as err:%v", err)
			return
		}
		var pending []int
		if err = json.Unmarshal([]byte(record.Pending), &pending); err != nil {
			log.Error("Unmarshal pending failed as err:%v", err)
			return
		}
		if len(pending) == 0 {
			if err := dao.UpdatePdfParseRecordStatus(db, msg.fileId, dao.StParseSuccess); err != nil {
				log.Error("UpdatePdfParseRecordStatus failed as err:%v", err)
			}
			return
		}

		// Take the first pending task and persist the remaining list before
		// the work starts.
		firstPage := pending[0]
		lastPage := math.Min(record.TotalPage, firstPage+record.TaskPage-1)
		newPending := util.ConvertToJsonStr(pending[1:])
		expireTime := getExpireTime(record.TaskEstimateMs)
		if err := dao.UpdatePdfParseRecord(db, msg.fileId, newPending, firstPage, msg.imageDpi, expireTime); err != nil {
			log.Error("UpdatePdfParseRecord failed as err:%v", err)
			return
		}

		// Parse the task; on failure log the cause and upload placeholder
		// images for its pages.
		if err := parseAndUpload(c, msg, firstPage, lastPage); err != nil {
			log.Error("parseAndUpload failed, firstPage: %d, lastPage: %d, err:%v", firstPage, lastPage, err)
			if err := GenFailedImageUpload(c, record.BasePdfParseRecord, firstPage, lastPage); err != nil {
				log.Error("GenFailedImageUpload failed as err:%v", err)
			}
		}
	}
}
// GenFailedImageUpload publishes placeholder images for pages
// firstPage..lastPage of a task that failed to parse. One placeholder is
// generated locally and uploaded (and compressed unless disabled) as the
// image of firstPage; the remaining pages are created by copying that object
// inside OSS. Copy failures are logged and do not abort the function.
//
// The local working directory is removed before returning so placeholder
// files do not pile up on disk.
func GenFailedImageUpload(c *gin.Context, record dao.BasePdfParseRecord, firstPage, lastPage int) error {
	log := mw.GetLogger(c)
	localImgDir := getLocalImageDir(getLocalImgDirPrefix(record.FileId), firstPage)
	if err := os.MkdirAll(localImgDir, os.ModePerm); err != nil {
		return errors.Wrap(err, "mkdir")
	}
	defer func() {
		if err := os.RemoveAll(localImgDir); err != nil {
			log.Error("remove dir failed as: %s", err)
		}
	}()

	imgName := pdf_util.GetImgName(firstPage)
	if err := GenFailedImage(path.Join(localImgDir, imgName), record.Width, record.Height); err != nil {
		return err
	}

	remoteImgOssPath := getImageOssPath(record.FileOssPath)
	// Upload the placeholder as the image of firstPage.
	if err := uploadImage(localImgDir, remoteImgOssPath, firstPage, firstPage); err != nil {
		return errors.Wrap(err, "upload image")
	}
	// Compress it unless compression is disabled for this record.
	if !record.DisablePicCompress {
		if err := compressImage(c, remoteImgOssPath, firstPage, firstPage); err != nil {
			return errors.Wrap(err, "compressImage")
		}
	}

	// The other pages are plain OSS copies of the first placeholder.
	srcImg := path.Join(remoteImgOssPath, imgName)
	srcImgCompress := getImageOssCompressDir(srcImg)
	for start := firstPage + 1; start <= lastPage; start++ {
		dstImg := path.Join(remoteImgOssPath, pdf_util.GetImgName(start))
		log.Debug("oss copy src:%s, dst:%s", srcImg, dstImg)
		if err := oss.Helper.CopyObject(srcImg, dstImg); err != nil {
			log.Error(" srcImg:%s, dstImg:%s oss copy failed as err:%v", srcImg, dstImg, err)
		}
		if record.DisablePicCompress {
			continue
		}
		dstImgCompress := getImageOssCompressDir(dstImg)
		if err := oss.Helper.CopyFolder(srcImgCompress, dstImgCompress); err != nil {
			log.Error(" srcImgCompress:%s, dstImgCompress:%s oss copy failed as err:%v",
				srcImgCompress, dstImgCompress, err)
		}
	}
	return nil
}
// GenFailedImage generates a placeholder image of the given size and writes
// it to filePath, creating or truncating the file.
//
// Write and close errors are returned, so a truncated placeholder is reported
// to the caller instead of being uploaded silently.
func GenFailedImage(filePath string, width, height float64) error {
	data, err := img_util.GenFailImage(width, height)
	if err != nil {
		return err
	}
	outFile, err := os.Create(filePath)
	if err != nil {
		return err
	}
	if _, err = outFile.Write(data); err != nil {
		// The write error is the one worth reporting; the close result adds
		// nothing here.
		_ = outFile.Close()
		return err
	}
	return outFile.Close()
}
deal_parse_jump.go
package pdf_parse_v2
import (
"encoding/json"
"github.com/gin-gonic/gin"
"pps/dao"
pfc "pps/helper/pdf_cache"
"support/collection/_set"
"support/database/redis"
"support/logger"
"support/safe"
"support/util"
db2 "support/web/db"
"support/web/mw"
"time"
)
const (
	// maxJumpCount is the jump count at which DealJump stops starting new
	// jump tasks.
	maxJumpCount = 5
	// PpsJumpHisCachePrefix is the redis key prefix of the cached jump
	// history; the file id is appended to it.
	PpsJumpHisCachePrefix = "PPS:PDF:JUMP_FILEID_"
)
func DealJump(c *gin.Context) {
log := mw.GetLogger(c)
db := db2.Db(c)
// 参数校验
var param requestJumpParam
if err := c.ShouldBindQuery(¶m); err != nil {
log.Error("DealJump request param failed as: %s", err)
mw.RetFail(c, mw.ErrBadParam)
return
}
fileId := param.FileId
jumpHis, _ := CacheGetJumpHis(log, fileId)
// redis有缓存
if len(jumpHis.History) > 0 {
if hasParse(jumpHis, param.PageNo, false) {
log.Debug("pageNo:%d hasParse task", param.PageNo)
mw.RetJSON(c, "")
return
}
}
record, err := dao.GetPdfParseRecordByFileId(db, fileId)
if err != nil {
log.Error("DealJump GetPdfParseRecordByFileId failed as: %s", err)
mw.RetFail(c, mw.ErrDb)
return
}
jumpHis = buildJumpHis(record, jumpHis.History)
log.Debug("jumpHis:%s", jumpHis)
if hasParse(jumpHis, param.PageNo, true) {
log.Debug("pageNo:%d hasParse task", param.PageNo)
mw.RetJSON(c, "")
return
}
// 更新his
_ = CacheSetJumpHis(log, fileId, jumpHis)
// 调整任务顺序
pending := adjustPendingList(record, param.PageNo)
defer func() {
// 更新pending
if err = dao.UpdatePdfParseRecordPending(db, fileId, util.ConvertToJsonStr(pending)); err != nil {
log.Error("DealJump UpdatePdfParseRecordPending failed as: %s", err)
}
}()
// 更新解析记录
jumpCount, err := dao.GetPdfParseJumpCount(db, fileId)
if err != nil {
log.Error("DealJump GetPdfParseJumpCount failed as: %s", err)
mw.RetFail(c, mw.ErrDb)
return
}
if jumpCount >= maxJumpCount {
log.Warn("currJumpCount:%d gt maxJumCount:%d", jumpCount, maxJumpCount)
mw.RetJSON(c, "")
return
}
fromPage := pending[0]
expireTime := getExpireTime(record.TaskEstimateMs)
jumpRecord := dao.BuildPdfParseJumpFromParseRecord(record, fromPage, expireTime)
// 创建跳页记录
if err = dao.SavePdfParseJump(db, jumpRecord); err != nil {
log.Error("DealJump SavePdfParseJump failed as: %s", err)
mw.RetFail(c, mw.ErrDb)
return
}
pending = pending[1:]
mw.RetJSON(c, "")
// 处理跳页
go safe.Safego(func() {
dealJump(c.Copy(), jumpRecord)
}, "dealJump")
}
// CacheGetJumpHis loads the cached jump history of fileId from redis. The
// returned pointer is never nil; on a lookup or decode error it is returned
// together with that error. A redis.ErrGetNil lookup error is not logged.
func CacheGetJumpHis(log logger.ILog, fileId string) (*JumpHis, error) {
	his := new(JumpHis)
	value, err := redis.Get(PpsJumpHisCachePrefix + fileId)
	switch {
	case err == nil:
		return his, json.Unmarshal([]byte(value), his)
	case err != redis.ErrGetNil:
		log.Error("CacheGetJumpHis, fileId: %s, err: %s", fileId, err)
	}
	return his, err
}
// CacheSetJumpHis stores the jump history of fileId in redis as JSON with a
// one-hour expiry. Encoding and redis errors are logged and returned.
func CacheSetJumpHis(log logger.ILog, fileId string, his *JumpHis) error {
	payload, err := json.Marshal(his)
	if err == nil {
		err = redis.Set(PpsJumpHisCachePrefix+fileId, string(payload), time.Hour)
	}
	if err != nil {
		log.Error("CacheSetJumpHis, fileId: %s, err: %s", fileId, err)
	}
	return err
}
// hasParse reports whether the task covering pageNo is already recorded in
// the history. Start value 0 stands for the pre-parsed pages and always
// counts as recorded. When the task is not recorded, his.History is rewritten
// from the set (which now contains 0) and, if insert is true, also contains
// the task's start page.
func hasParse(his *JumpHis, pageNo int, insert bool) bool {
	// Prepend 0 so pages inside the pre-parsed range resolve to start 0.
	starts := append([]int{0}, calPendingList(his.PrePage, his.TaskPage, his.TotalPage)...)
	taskStart := calPageStart(starts, pageNo)

	recorded := _set.NewBySlice(his.History)
	recorded.Add(0)
	if recorded.Has(taskStart) {
		return true
	}
	if insert {
		// Remember the task handled by this call.
		recorded.Add(taskStart)
	}
	his.History = recorded.Slice()
	return false
}
// calPageStart returns the largest element of the ascending slice arr that is
// not greater than target, found by binary search. A target below the first
// element yields the first element, a target above the last element yields
// the last one, and an empty slice yields 0.
func calPageStart(arr []int, target int) int {
	n := len(arr)
	switch {
	case n == 0:
		return 0 // not expected in practice
	case target < arr[0]:
		return arr[0]
	case target > arr[n-1]:
		return arr[n-1]
	}

	// Upper-bound search: lo ends on the first index whose value exceeds
	// target, so the answer sits right before it.
	lo, hi := 0, n
	for lo < hi {
		mid := lo + (hi-lo)/2
		if arr[mid] <= target {
			lo = mid + 1
		} else {
			hi = mid
		}
	}
	if lo == 0 {
		return 0 // only possible when arr is not sorted
	}
	return arr[lo-1]
}
// adjustPendingList decodes the record's pending list and rotates it so that
// the task covering pageNo comes first. The list is returned as decoded when
// it is empty or when that task is already at the front.
func adjustPendingList(record *dao.TblPdfParseRecord, pageNo int) (pending []int) {
	// A decode failure is ignored and leaves the list empty.
	_ = json.Unmarshal([]byte(record.Pending), &pending)
	if len(pending) == 0 {
		return pending
	}
	allStarts := calPendingList(record.PrePage, record.TaskPage, record.TotalPage)
	if wanted := calPageStart(allStarts, pageNo); wanted != pending[0] {
		pending = rotateSlice(pending, wanted)
	}
	return pending
}
// rotateSlice returns arr rotated left so that the first occurrence of value
// becomes the first element. If value is not present, arr itself is returned.
//
// The rotated result is built in a fresh slice. Appending to arr[index:]
// directly would write into the caller's backing array beyond len(arr)
// whenever it has spare capacity, clobbering data that shares that array.
func rotateSlice(arr []int, value int) []int {
	// Locate the element.
	index := -1
	for i, v := range arr {
		if v == value {
			index = i
			break
		}
	}
	// Not found: hand back the original slice.
	if index == -1 {
		return arr
	}
	result := make([]int, 0, len(arr))
	result = append(result, arr[index:]...)
	result = append(result, arr[:index]...)
	return result
}
// dealJump executes one jump task in the background: it parses pages
// record.FromPage..record.ToPage and then marks the jump as finished. A
// failed parse is logged and replaced by placeholder images; the jump is
// marked as finished in that case as well.
func dealJump(c *gin.Context, record *dao.TblParseJump) {
	log := mw.GetLogger(c)
	db := db2.Db(c)
	// Fetch the PDF info (local path, page count, width and height).
	pdfInfo, err := pfc.GetPdf(log, record.FileOssPath)
	if err != nil {
		log.Error("GetPdf failed as: %s", err)
		return
	}
	msg := baseRecord2ParseMsg(pdfInfo, record.BasePdfParseRecord)

	// Parse the task; on failure log the cause and upload placeholder images
	// for its pages.
	if err := parseAndUpload(c, msg, record.FromPage, record.ToPage); err != nil {
		log.Error("parseAndUpload failed, fromPage: %d, toPage: %d, err:%v", record.FromPage, record.ToPage, err)
		if err := GenFailedImageUpload(c, record.BasePdfParseRecord, record.FromPage, record.ToPage); err != nil {
			log.Error("GenFailedImageUpload failed as err:%v", err)
		}
	}
	if err := dao.UpdateJumpSuccess(db, record.FileId); err != nil {
		log.Error("UpdateJumpSuccess failed as err:%v", err)
	}
}
4.代码解读
跳页主要是调整解析队列pending的顺序,如果有可用协程就直接启动协程执行跳页解析
left解析主要就是从数据库的pending字段取出一个start 然后开始解析
5.番外篇pdfium介绍
PDFium 是一个开源的 PDF 渲染引擎,用于解析和呈现 PDF 文档。它最初由 Foxit Software 开发,随后由 Google 作为 Chromium 项目的一部分维护和发布。PDFium 被广泛应用于浏览器(如 Google Chrome)的内置 PDF 查看器,以及其他需要处理 PDF 文档的应用程序中。
PDFium 的主要功能
- PDF 渲染:PDFium 可以将 PDF 页面渲染为不同格式的图像(如 Bitmaps),并支持高效的缩放和旋转操作。
- 文本提取:可以提取 PDF 文档中的文本内容,便于文本检索和搜索引擎的索引。
- 表单处理:支持 PDF 表单的填充、提交和提取操作。
- 注释处理:支持读取和管理 PDF 注释,如高亮、注释和签名。
- 图像和图形处理:能够解析和渲染 PDF 文件中的图像和矢量图形。
- 安全性:支持解析和处理加密的 PDF 文档。
主要特性
- 跨平台支持:PDFium 可以在 Windows、macOS、Linux 等多个平台上编译和运行。
- 高性能:针对性能进行了优化,可以高效处理大型和复杂的 PDF 文档。
- 模块化:具有模块化架构,可以选择性编译和使用需要的功能模块。
- 丰富的 API:提供了丰富的 API,可以进行复杂的 PDF 操作和自定义扩展。
如何使用 PDFium
1. 下载和构建
PDFium 是一个开源项目,可以从其官方 Git 仓库(托管在 googlesource.com,而非 GitHub)下载源码:
git clone https://pdfium.googlesource.com/pdfium
你可以按照官方文档进行配置和编译,以适应不同平台和需求。官方构建流程使用 depot_tools(gclient)、GN 和 Ninja。
2. 基本用例
构建完成后,你可以在你的 C/C++ 项目中使用 PDFium。以下是一个简单的示例,展示如何加载一个已有的 PDF 文档并将第一页渲染为 Bitmap:
#include "public/fpdfview.h"
// Minimal PDFium sample: loads example.pdf, renders its first page into a
// bitmap with a white background, then releases every resource.
// Returns 0 on success and -1 when the document or the page cannot be loaded.
int main(int argc, char** argv) {
// Initialize PDFium
FPDF_InitLibrary();
// Load the PDF document
FPDF_DOCUMENT doc = FPDF_LoadDocument("example.pdf", nullptr);
if (!doc) {
FPDF_DestroyLibrary();
return -1;
}
// Load the first page
FPDF_PAGE page = FPDF_LoadPage(doc, 0);
if (!page) {
FPDF_CloseDocument(doc);
FPDF_DestroyLibrary();
return -1;
}
// Render the first page into a bitmap
int width = FPDF_GetPageWidth(page);
int height = FPDF_GetPageHeight(page);
FPDF_BITMAP bitmap = FPDFBitmap_Create(width, height, 0);
FPDFBitmap_FillRect(bitmap, 0, 0, width, height, 0xFFFFFFFF); // white background
FPDF_RenderPageBitmap(bitmap, page, 0, 0, width, height, 0, 0);
// Save the bitmap to a file or process it further
// ...
// Release resources
FPDFBitmap_Destroy(bitmap);
FPDF_ClosePage(page);
FPDF_CloseDocument(doc);
FPDF_DestroyLibrary();
return 0;
}
常见应用场景
- 嵌入式 PDF 查看器:你可以在桌面或移动应用中嵌入一个自定义的 PDF 查看组件,为用户提供查看和互动功能。
- 服务器端 PDF 处理:在服务器端应用中使用 PDFium 批量处理 PDF 文档,实现自动化的文档管理和处理流程。
- PDF 数据提取:通过 PDFium 提取 PDF 文档中的文本、图像和其他数据,用于数据分析、数据挖掘等。
资源和社区
- 官方代码仓库:PDFium(https://pdfium.googlesource.com/pdfium/)
- Google Groups:用于讨论和获取支持 pdfium-discuss
PDFium 作为一个强大且灵活的 PDF 引擎,广泛应用于各种需处理 PDF 文档的场景。如果你有特定的需求,深入研究 PDFium 的文档和示例,结合实际情况进行开发,可以达到最佳效果。