python基础
模块导入
根据需要添加相应模块
from bs4 import BeautifulSoup
from lxml import html
from w3lib import html
from lxml import etree
from docx import Document
from w3lib.html import remove_tags
from docx.shared import Inches
import xml
import requests
import docx
import xlwt
import openpyxl
import xlrd
出现如下情况
导入过程:
打开控制台cmd
cd /d +python pip所在文件路径切换文件夹
模块导入 pip install + 模块名
pycharm没有自动添加时:
File-Settings-Project worms-ProjectInterpreter 点击右侧加号“+”手动选取,选择后点击Install Package添加即可。
爬虫部分
代码包括:python写word文档(文字、表格、选择表格样式等)
表格样式选择:
###爬虫部分###
###读入全部网址缓存
data = []
###从文档中读取已存储的网址链接
for line in open("C:\\Users\\sypgh\\Desktop\\http.txt","r"):
data.append(line)
i = 0
for sen in data:
###页面读取
url = data[i]
i = i+1
if(url != ""):
f = requests.get(url)
soup = BeautifulSoup(f.content,"lxml")
#print(f.content.decode())
###基本信息读取
m = soup.find('font',size='2')
#print(m.get_text())
sta = m.get_text()
###字符串处理
school = ""#学校名称
major = ""#专业名称
pici = ""#批次 暂时没用
kelei = ""#科类 暂时没用
num = 0
for j in sta:
#if(num == 1):
# pici = pici + j;
#if(num == 3):
# kelei = kelei + j;
if(num == 5 and j!='\n'):
school = school + j;
if(num == 7 and j!='\n'):
major = major + j;
if (j == '\n'):
num = num + 1;
print(str(i)+"/3250"+"school:"+school+"major:"+major)
###文档写入
file = docx.Document()
file.add_paragraph(sta)
table = file.add_table(1,7,style="Light List Accent 3")
heading_cells = table.rows[0].cells
heading_cells[0].text = u'考生号'
heading_cells[1].text = u'姓名'
heading_cells[2].text = u'投档成绩'
heading_cells[3].text = u'录取方式'
heading_cells[4].text = u'录取志愿'
heading_cells[5].text = u'计划性质'
heading_cells[6].text = u'录取时间'
#print()
###学生信息读取处理
l=0
for k in soup.find_all('tr'):
a = k.find_all('td')
if(l!=0):
cells = table.add_row().cells
cells[0].text = a[0].string
cells[1].text = a[1].string
cells[2].text = a[2].string
cells[3].text = a[3].string
cells[4].text = a[4].string
cells[5].text = a[5].string
cells[6].text = a[6].string
#print(a[0].string+' '+a[1].string+' '+a[2].string+' '+a[3].string+' '+a[4].string+' '+' '+a[5].string+' '+a[6].string)
#person = a[0].string+' '+a[1].string+' '+a[2].string+' '+a[3].string+' '+a[4].string+' '+' '+a[5].string+' '+a[6].string
#file.add_paragraph(person)
l = l+1
###文档存储
file.save("D:\\2019高考数据\\理科专科\\专业\\"+school+major+".docx")
#sta存储学校专业批次等基本信息
#person是【局部变量】,存储学生的个人信息
文档处理部分
offfice操作
竟然不会,竟然可以这样系列
1.提取excel表格中的超链接
如图所示的表格中,蓝色部分插入了超链接,现在需要把这些超链接表示的网址提取出来。
Alt F11打开界面,插入-模块中输入代码
Sub test()
For Each cell In Range("C1:C3989")
On Error Resume Next
cell.Offset(0, 10).Value = cell.Hyperlinks(1).Address
Next
End Sub
C1:C3989表示超链接存在的行列范围,根据需求改变
On Error Resume Next表示跳过错误行,可以自动跳过不处理空行(之前由于存在空行会报错)
cell.Offset(0, 10).Value括号中的数值表示新产生的链接与当前表格元素的相对位置,我把它设置在了右侧10列,可以根据需要修改。
输入代码后F5运行即可
结果如图所示,右侧出现了提取出来的连接
2.Excel去除空行(整行)
ctrl+g打开定位,选择定位条件,选择空值,删除整行(下方单元格上移)即可
python操作word和Excel
代码包括:python读Excel表格,Python写Excel表格,Python写入Excel表格超链接等
###文档处理部分###
file_path = r'C:\\Users\\sypgh\\Desktop\\2019.xls'#文件路径
data = xlrd.open_workbook(file_path)#获取数据
table = data.sheet_by_name("普理专科")#获取sheet
rownum = table.nrows#行数
colnum = table.ncols#列数
#逐行处理
flag_title = 0#标记是否是学校基本信息
flag_in = 1#标记是否打专业表头
school = ""#学校名
#file = docx.Document()
file = xlwt.Workbook(encoding='utf-8')
sheet1 = file.add_sheet('院校信息')
sheet2 = file.add_sheet('专业信息')
line = 1#记录Excel写了多少行
#table_school = file.add_table(1, 4, style="Light List Accent 3")#填报次序 最高分 最低分 总人数
#table_major = file.add_table(1, 7, style="Light List Accent 3")#专业代号 专业名称 填报次序 最高分 最低分 最低分位次 录取人数
for row in range(1,rownum):
print(str(row)+"/"+str(rownum))
# 内部专业信息->写入表格,做链接
if (table.cell(row,2).value != ""):
flag_title = 0
code = ""
if (isinstance(table.cell(row, 1).value, float)):
code = str(int(table.cell(row, 1).value))
else:
code = str(table.cell(row, 1).value)
code = code.zfill(2)
link = "D:\\2019高考数据\\理科专科\\专业\\"+school+code+str(table.cell(row, 2).value)+".docx"
#print(link)
if(flag_in == 1):
flag_in = 0
line = 1
#file.add_paragraph("学校专业志愿填报信息:")
sheet2.write(line, 0, "专业代号")
sheet2.write(line, 1, "专业名称")
sheet2.write(line, 2, "填报次序")
sheet2.write(line, 3, "最高分")
sheet2.write(line, 4, "最低分")
sheet2.write(line, 5, "最低分位次")
sheet2.write(line, 6, "录取人数")
line = line + 1
if (isinstance(table.cell(row, 1).value, float)):
sheet2.write(line, 0, str(int(table.cell(row, 1).value)))
else:
sheet2.write(line, 0, str(table.cell(row, 1).value))
sheet2.write(line, 1, str(table.cell(row, 2).value))
sheet2.write(line, 2, str(table.cell(row, 3).value))
sheet2.write(line, 3, str(int(table.cell(row, 4).value)))
sheet2.write(line, 4, str(int(table.cell(row, 5).value)))
if (isinstance(table.cell(row, 6).value, float)):
sheet2.write(line, 5, str(int(table.cell(row, 6).value)))
else:
sheet2.write(line, 5, str(table.cell(row, 6).value))
#sheet2.write(line, 5, str(int(table.cell(row, 6).value)))
sheet2.write(line, 6, str(int(table.cell(row, 7).value)))
sheet2.write(line, 7, xlwt.Formula('HYPERLINK("'+link+'";"详细")'))
line = line + 1
else:
if (isinstance(table.cell(row, 1).value, float)):
sheet2.write(line, 0, str(int(table.cell(row, 1).value)))
else:
sheet2.write(line, 0, str(table.cell(row, 1).value))
sheet2.write(line, 1, str(table.cell(row, 2).value))
sheet2.write(line, 2, str(table.cell(row, 3).value))
sheet2.write(line, 3, str(int(table.cell(row, 4).value)))
sheet2.write(line, 4, str(int(table.cell(row, 5).value)))
#sheet2.write(line, 5, str(int(table.cell(row, 6).value)))
if (isinstance(table.cell(row, 6).value, float)):
sheet2.write(line, 5, str(int(table.cell(row, 6).value)))
else:
sheet2.write(line, 5, str(table.cell(row, 6).value))
sheet2.write(line, 6, str(int(table.cell(row, 7).value)))
sheet2.write(line, 7, xlwt.Formula('HYPERLINK("' + link + '";"详细")'))
line = line + 1
# 学校整体信息->写入文字,基本信息
else:
if (flag_title == 1):#如果是同一学校的其他批次
sheet1.write(line, 0, str(table.cell(row, 3).value))
sheet1.write(line, 1, str(int(table.cell(row, 4).value)))
sheet1.write(line, 2, str(int(table.cell(row, 5).value)))
if (isinstance(table.cell(row, 8).value, float)):
sheet1.write(line, 3, str(int(table.cell(row, 8).value)))
else:
sheet1.write(line, 3, str(table.cell(row, 8).value))
#sheet1.write(line, 3, str(int(table.cell(row, 8).value)))
line = line + 1
else:#如果是新的学校
if(school != ""):
#file.save("C:\\Users\\sypgh\\Desktop\\"+school+".xls")
file.save("D:\\2019高考数据\\理科专科\\学校\\" + school + ".xls")
school = table.cell(row, 0).value # 学校名赋值
print(str(row) + school)
line = 1
flag_title = 1
flag_in = 1
#file = docx.Document()
file = xlwt.Workbook(encoding='utf-8')
sheet1 = file.add_sheet('院校信息')
sheet2 = file.add_sheet('专业信息')
#file.add_paragraph("院校代码及名称:"+school)
sheet1.write(line, 0, "院校")
sheet1.write(line, 1, school)
line = line + 1
sheet1.write(line, 0, '填报次序')
sheet1.write(line, 1, '最高分')
sheet1.write(line, 2, '最低分')
sheet1.write(line, 3, '总人数')
line = line + 1
sheet1.write(line, 0, str(table.cell(row,3).value))
sheet1.write(line, 1, str(int(table.cell(row,4).value)))
sheet1.write(line, 2, str(int(table.cell(row,5).value)))
#sheet1.write(line, 3, str(int(table.cell(row,8).value)))
if (isinstance(table.cell(row, 8).value, float)):
sheet1.write(line, 3, str(int(table.cell(row, 8).value)))
else:
sheet1.write(line, 3, str(table.cell(row, 8).value))
line = line + 1
#file.save("C:\\Users\\sypgh\\Desktop\\"+school+".xls")
file.save("D:\\2019高考数据\\理科专科\\学校\\"+school+".xls")
###学校信息摘取部分###
file_path = r'C:\\Users\\sypgh\\Desktop\\高考数据\\2019普文一批.xls'#文件路径
data = xlrd.open_workbook(file_path)#获取数据
table = data.sheet_by_name("1")#获取sheet
rownum = table.nrows#行数
colnum = table.ncols#列数
#逐行处理
flag_title = 0#标记是否是学校基本信息
flag_in = 1#标记是否打专业表头
school = ""#学校名
#file = docx.Document()
file = xlwt.Workbook(encoding='utf-8')
sheet1 = file.add_sheet('院校信息')
#sheet2 = file.add_sheet('专业信息')
line = 1#记录Excel写了多少行
sheet1.write(line, 0, '院校及代码')
sheet1.write(line, 1, '填报次序')
sheet1.write(line, 2, '最高分')
sheet1.write(line, 3, '最低分')
sheet1.write(line, 4, '总人数')
line = line + 1
for row in range(1,rownum):
print(str(row)+"/"+str(rownum))
if (table.cell(row,2).value != ""):
continue
# 学校整体信息->写入文字,基本信息
else:
school = table.cell(row, 0).value # 学校名赋值
link = "D:\\2019高考数据\\文科一批\\学校\\" + school + ".xls"
print(str(row) + school)
flag_title = 1
# file = docx.Document()
# file.add_paragraph("院校代码及名称:"+school)
sheet1.write(line, 0, school)
sheet1.write(line, 1, str(table.cell(row, 3).value))
sheet1.write(line, 2, str(int(table.cell(row, 4).value)))
sheet1.write(line, 3, str(int(table.cell(row, 5).value)))
# sheet1.write(line, 3, str(int(table.cell(row,8).value)))
if (isinstance(table.cell(row, 8).value, float)):
sheet1.write(line, 4, str(int(table.cell(row, 8).value)))
else:
sheet1.write(line, 4, str(table.cell(row, 8).value))
sheet1.write(line, 5, xlwt.Formula('HYPERLINK("' + link + '";"详细")'))
line = line + 1
#file.save("C:\\Users\\sypgh\\Desktop\\"+school+".xls")
file.save("D:\\2019高考数据\\理科专科\\"+"文一学校总览"+".xls")