python基础

模块导入

根据需要添加相应模块

from bs4 import BeautifulSoup
from lxml import html
from w3lib import html
from lxml import etree
from docx import Document
from w3lib.html import remove_tags
from docx.shared import Inches
import xml
import requests
import docx
import xlwt
import openpyxl
import xlrd

出现如下情况

pythonword表格 python填写word表格_pythonword表格

pythonword表格 python填写word表格_超链接_02

导入过程:

打开控制台cmd

cd /d +python pip所在文件路径切换文件夹

pythonword表格 python填写word表格_python_03

模块导入 pip install + 模块名

pycharm没有自动添加时:
File-Settings-Project worms-ProjectInterpreter 点击右侧加号“+”手动选取,选择后点击Install Package添加即可。

爬虫部分

代码包括:python写word文档(文字、表格、选择表格样式等)
表格样式选择:

###爬虫部分###
###读入全部网址缓存
data = []
###从文档中读取已存储的网址链接
for line in open("C:\\Users\\sypgh\\Desktop\\http.txt","r"):
    data.append(line)
i = 0
for sen in data:
    ###页面读取
    url = data[i]
    i = i+1
    if(url != ""):
        f = requests.get(url)
        soup = BeautifulSoup(f.content,"lxml")
        #print(f.content.decode())

        ###基本信息读取
        m = soup.find('font',size='2')
        #print(m.get_text())
        sta = m.get_text()

        ###字符串处理
        school = ""#学校名称
        major = ""#专业名称
        pici = ""#批次  暂时没用
        kelei = ""#科类  暂时没用
        num = 0
        for j in sta:
            #if(num == 1):
             #   pici = pici + j;
            #if(num == 3):
             #   kelei = kelei + j;
            if(num == 5 and j!='\n'):
                school = school + j;
            if(num == 7 and j!='\n'):
                major = major + j;
            if (j == '\n'):
                num = num + 1;
        print(str(i)+"/3250"+"school:"+school+"major:"+major)
        ###文档写入
        file = docx.Document()
        file.add_paragraph(sta)
        table = file.add_table(1,7,style="Light List Accent 3")
        heading_cells = table.rows[0].cells
        heading_cells[0].text = u'考生号'
        heading_cells[1].text = u'姓名'
        heading_cells[2].text = u'投档成绩'
        heading_cells[3].text = u'录取方式'
        heading_cells[4].text = u'录取志愿'
        heading_cells[5].text = u'计划性质'
        heading_cells[6].text = u'录取时间'
        #print()

        ###学生信息读取处理
        l=0
        for k in soup.find_all('tr'):
            a = k.find_all('td')
            if(l!=0):
                cells = table.add_row().cells
                cells[0].text = a[0].string
                cells[1].text = a[1].string
                cells[2].text = a[2].string
                cells[3].text = a[3].string
                cells[4].text = a[4].string
                cells[5].text = a[5].string
                cells[6].text = a[6].string
                #print(a[0].string+' '+a[1].string+' '+a[2].string+' '+a[3].string+' '+a[4].string+' '+' '+a[5].string+' '+a[6].string)
            #person = a[0].string+' '+a[1].string+' '+a[2].string+' '+a[3].string+' '+a[4].string+' '+' '+a[5].string+' '+a[6].string
            #file.add_paragraph(person)
            l = l+1
        ###文档存储
        file.save("D:\\2019高考数据\\理科专科\\专业\\"+school+major+".docx")


#sta存储学校专业批次等基本信息
#person是【局部变量】,存储学生的个人信息

文档处理部分

offfice操作

竟然不会,竟然可以这样系列

1.提取excel表格中的超链接

pythonword表格 python填写word表格_超链接_04


如图所示的表格中,蓝色部分插入了超链接,现在需要把这些超链接表示的网址提取出来。

Alt F11打开界面,插入-模块中输入代码

pythonword表格 python填写word表格_python_05

Sub test()
For Each cell In Range("C1:C3989")
On Error Resume Next
cell.Offset(0, 10).Value = cell.Hyperlinks(1).Address
Next
End Sub

C1:C3989表示超链接存在的行列范围,根据需求改变
On Error Resume Next表示跳过错误行,可以自动跳过不处理空行(之前由于存在空行会报错)
cell.Offset(0, 10).Value括号中的数值表示新产生的链接与当前表格元素的相对位置,我把它设置在了右侧10列,可以根据需要修改。

输入代码后F5运行即可

结果如图所示,右侧出现了提取出来的连接

pythonword表格 python填写word表格_数据_06


2.Excel去除空行(整行)

ctrl+g打开定位,选择定位条件,选择空值,删除整行(下方单元格上移)即可

python操作word和Excel

代码包括:python读Excel表格,Python写Excel表格,Python写入Excel表格超链接等

###文档处理部分###
file_path = r'C:\\Users\\sypgh\\Desktop\\2019.xls'#文件路径
data = xlrd.open_workbook(file_path)#获取数据
table = data.sheet_by_name("普理专科")#获取sheet
rownum = table.nrows#行数
colnum = table.ncols#列数

#逐行处理
flag_title = 0#标记是否是学校基本信息
flag_in = 1#标记是否打专业表头
school = ""#学校名
#file = docx.Document()
file = xlwt.Workbook(encoding='utf-8')
sheet1 = file.add_sheet('院校信息')
sheet2 = file.add_sheet('专业信息')
line = 1#记录Excel写了多少行
#table_school = file.add_table(1, 4, style="Light List Accent 3")#填报次序 最高分 最低分 总人数
#table_major = file.add_table(1, 7, style="Light List Accent 3")#专业代号 专业名称 填报次序 最高分 最低分 最低分位次 录取人数
for row in range(1,rownum):
    print(str(row)+"/"+str(rownum))
    # 内部专业信息->写入表格,做链接
    if (table.cell(row,2).value != ""):
        flag_title = 0
        code = ""
        if (isinstance(table.cell(row, 1).value, float)):
            code = str(int(table.cell(row, 1).value))
        else:
            code = str(table.cell(row, 1).value)
        code = code.zfill(2)
        link = "D:\\2019高考数据\\理科专科\\专业\\"+school+code+str(table.cell(row, 2).value)+".docx"
        #print(link)
        if(flag_in == 1):
            flag_in = 0
            line = 1
            #file.add_paragraph("学校专业志愿填报信息:")
            sheet2.write(line, 0, "专业代号")
            sheet2.write(line, 1, "专业名称")
            sheet2.write(line, 2, "填报次序")
            sheet2.write(line, 3, "最高分")
            sheet2.write(line, 4, "最低分")
            sheet2.write(line, 5, "最低分位次")
            sheet2.write(line, 6, "录取人数")
            line = line + 1
            if (isinstance(table.cell(row, 1).value, float)):
                sheet2.write(line, 0, str(int(table.cell(row, 1).value)))
            else:
                sheet2.write(line, 0, str(table.cell(row, 1).value))
            sheet2.write(line, 1, str(table.cell(row, 2).value))
            sheet2.write(line, 2, str(table.cell(row, 3).value))
            sheet2.write(line, 3, str(int(table.cell(row, 4).value)))
            sheet2.write(line, 4, str(int(table.cell(row, 5).value)))
            if (isinstance(table.cell(row, 6).value, float)):
                sheet2.write(line, 5, str(int(table.cell(row, 6).value)))
            else:
                sheet2.write(line, 5, str(table.cell(row, 6).value))
            #sheet2.write(line, 5, str(int(table.cell(row, 6).value)))
            sheet2.write(line, 6, str(int(table.cell(row, 7).value)))
            sheet2.write(line, 7, xlwt.Formula('HYPERLINK("'+link+'";"详细")'))
            line = line + 1

        else:
            if (isinstance(table.cell(row, 1).value, float)):
                sheet2.write(line, 0, str(int(table.cell(row, 1).value)))
            else:
                sheet2.write(line, 0, str(table.cell(row, 1).value))
            sheet2.write(line, 1, str(table.cell(row, 2).value))
            sheet2.write(line, 2, str(table.cell(row, 3).value))
            sheet2.write(line, 3, str(int(table.cell(row, 4).value)))
            sheet2.write(line, 4, str(int(table.cell(row, 5).value)))
            #sheet2.write(line, 5, str(int(table.cell(row, 6).value)))
            if (isinstance(table.cell(row, 6).value, float)):
                sheet2.write(line, 5, str(int(table.cell(row, 6).value)))
            else:
                sheet2.write(line, 5, str(table.cell(row, 6).value))
            sheet2.write(line, 6, str(int(table.cell(row, 7).value)))
            sheet2.write(line, 7, xlwt.Formula('HYPERLINK("' + link + '";"详细")'))
            line = line + 1

    # 学校整体信息->写入文字,基本信息
    else:
        if (flag_title == 1):#如果是同一学校的其他批次
            sheet1.write(line, 0, str(table.cell(row, 3).value))
            sheet1.write(line, 1, str(int(table.cell(row, 4).value)))
            sheet1.write(line, 2, str(int(table.cell(row, 5).value)))
            if (isinstance(table.cell(row, 8).value, float)):
                sheet1.write(line, 3, str(int(table.cell(row, 8).value)))
            else:
                sheet1.write(line, 3, str(table.cell(row, 8).value))
            #sheet1.write(line, 3, str(int(table.cell(row, 8).value)))
            line = line + 1

        else:#如果是新的学校
            if(school != ""):
                #file.save("C:\\Users\\sypgh\\Desktop\\"+school+".xls")
                file.save("D:\\2019高考数据\\理科专科\\学校\\" + school + ".xls")
            school = table.cell(row, 0).value  # 学校名赋值
            print(str(row) + school)
            line = 1
            flag_title = 1
            flag_in = 1
            #file = docx.Document()
            file = xlwt.Workbook(encoding='utf-8')
            sheet1 = file.add_sheet('院校信息')
            sheet2 = file.add_sheet('专业信息')
            #file.add_paragraph("院校代码及名称:"+school)
            sheet1.write(line, 0, "院校")
            sheet1.write(line, 1, school)
            line = line + 1
            sheet1.write(line, 0, '填报次序')
            sheet1.write(line, 1, '最高分')
            sheet1.write(line, 2, '最低分')
            sheet1.write(line, 3, '总人数')
            line = line + 1
            sheet1.write(line, 0, str(table.cell(row,3).value))
            sheet1.write(line, 1, str(int(table.cell(row,4).value)))
            sheet1.write(line, 2, str(int(table.cell(row,5).value)))
            #sheet1.write(line, 3, str(int(table.cell(row,8).value)))
            if (isinstance(table.cell(row, 8).value, float)):
                sheet1.write(line, 3, str(int(table.cell(row, 8).value)))
            else:
                sheet1.write(line, 3, str(table.cell(row, 8).value))
            line = line + 1

#file.save("C:\\Users\\sypgh\\Desktop\\"+school+".xls")
file.save("D:\\2019高考数据\\理科专科\\学校\\"+school+".xls")






###学校信息摘取部分###
file_path = r'C:\\Users\\sypgh\\Desktop\\高考数据\\2019普文一批.xls'#文件路径
data = xlrd.open_workbook(file_path)#获取数据
table = data.sheet_by_name("1")#获取sheet
rownum = table.nrows#行数
colnum = table.ncols#列数

#逐行处理
flag_title = 0#标记是否是学校基本信息
flag_in = 1#标记是否打专业表头
school = ""#学校名
#file = docx.Document()
file = xlwt.Workbook(encoding='utf-8')
sheet1 = file.add_sheet('院校信息')
#sheet2 = file.add_sheet('专业信息')
line = 1#记录Excel写了多少行
sheet1.write(line, 0, '院校及代码')
sheet1.write(line, 1, '填报次序')
sheet1.write(line, 2, '最高分')
sheet1.write(line, 3, '最低分')
sheet1.write(line, 4, '总人数')
line = line + 1
for row in range(1,rownum):
    print(str(row)+"/"+str(rownum))
    if (table.cell(row,2).value != ""):
        continue
    # 学校整体信息->写入文字,基本信息
    else:
        school = table.cell(row, 0).value  # 学校名赋值
        link = "D:\\2019高考数据\\文科一批\\学校\\" + school + ".xls"
        print(str(row) + school)
        flag_title = 1
        # file = docx.Document()
        # file.add_paragraph("院校代码及名称:"+school)
        sheet1.write(line, 0, school)
        sheet1.write(line, 1, str(table.cell(row, 3).value))
        sheet1.write(line, 2, str(int(table.cell(row, 4).value)))
        sheet1.write(line, 3, str(int(table.cell(row, 5).value)))
        # sheet1.write(line, 3, str(int(table.cell(row,8).value)))
        if (isinstance(table.cell(row, 8).value, float)):
            sheet1.write(line, 4, str(int(table.cell(row, 8).value)))
        else:
            sheet1.write(line, 4, str(table.cell(row, 8).value))
        sheet1.write(line, 5, xlwt.Formula('HYPERLINK("' + link + '";"详细")'))
        line = line + 1

#file.save("C:\\Users\\sypgh\\Desktop\\"+school+".xls")
file.save("D:\\2019高考数据\\理科专科\\"+"文一学校总览"+".xls")