使用lxml.html的cssselect解析HTML
Sunday, April 22, 2012 9:08:47 AM
Code
Output
from lxml.html import parse
from pprint import pprint
# Download the blog page and keep the root element of the parsed document.
dom = parse('http://my.opera.com/gotovoid/blog/').getroot()

# Every <li> inside <div id="tagcloud"> is one tag entry.
tag_items = dom.cssselect('div#tagcloud li')

# Map the text of each entry's first <a> child to the integer given by the
# last character of the <li>'s class attribute.
# NOTE(review): presumably that trailing digit is the tag-cloud size level;
# confirm against the page markup.
tags = dict(
    (li.find('a').text, int(li.get('class')[-1]))
    for li in tag_items
)

# Show the (name, level) pairs with the highest level first.
by_level = list(tags.items())
by_level.sort(key=lambda pair: pair[1], reverse=True)
pprint(by_level)
Output
[('encoding', 5),
 ('python', 5),
 ('vim', 5),
 ('windows', 5),
 ('i18n', 5),
 ('bash', 5),
 ('crawler', 5),
 ('jquery', 4),
 ('sqlite', 4),
 ('google', 4),
 ('webpy', 4),
 ('sox', 4),
 ('json', 4),
 ('firebug', 4),
 ('linux', 4),
 ('apache', 4),
 ('diff', 4),
 ('api', 1),
 ('awk', 1),
 ('array', 1)]


