# -*- coding: gbk -*-
import urllib2
from sgmllib import SGMLParser
class HotExtract(SGMLParser):
def __init__(self):
SGMLParser.__init__(self)
self.is_a = ""
self.hot = []
def start_a(self, attrs):
if len(attrs) == 0:
pass
else:
for (variable, value) in attrs:
if variable == "mon" and value == "ct=1&a=30":
self.is_a = 1
break
def end_a(self):
self.is_a = ""
def handle_data(self, text):
if self.is_a == 1:
self.hot.append(text)
def getHtml(url):
    """Download *url* and return the response body exactly as read.

    The body is returned without any decoding.  Any error raised by
    ``urllib2.urlopen`` or by reading the response propagates to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Python 2 response objects are not context managers, so release
        # the underlying connection explicitly, even if read() fails.
        response.close()
def extract_hot(html):
    """Parse *html* with HotExtract and return the list of collected texts.

    Args:
        html: the markup to parse, passed unchanged to the parser's feed().

    Returns:
        The parser's ``hot`` list.
    """
    parser = HotExtract()
    parser.feed(html)
    # feed() may keep an incomplete trailing construct buffered; close()
    # forces the parser to process everything it has received.
    parser.close()
    return parser.hot
html = getHtml("http://news.baidu.com/")
hot_list = extract_hot(html)
for hot in hot_list:
print hot