首先,介绍一下程序所要完成的事情:当前目录下有一个叫做web_pages的文件夹,该文件夹下所有的子文件夹都以日期命名,比如2016-01-09,而每一个这样的子文件夹下都有12个html文件。每一个html文件的大小在530kB左右,行数在4000行左右。日期子文件夹共有36个,所以:
1. 所有文件数目是:36 * 12 = 432
2. 总行数大约是:432 * 4000 = 1728000
3. 总大小为:212 MB
<tr> <td>199</td> <td class="hide-col">11-17</td> <td class="hide-col">160212</td> <td><a href="http://huobijijin.com/jijin/160212" title="国泰估值优势混合(LOF)的历史收益情况" target="_blank">国泰估值优势混合(LOF)</a></td> <td class="thd-text-right">1.9230</td> <td class="thd-text-right"><span class='green'>-1.99%</span></td> <td class="thd-text-right"><span class='green'>-0.67%</span></td> <td class="thd-text-right"><span class='red'>14.53%</span></td> <td class="thd-text-right"><span class='red'>10.2%</span></td> <td class="thd-text-right"><span class='red'>5.43%</span></td> <td class="thd-text-right"><span class='red'>92.3%</span></td> <td class="thd-text-right"><span class='red'>109.48%</span></td> <td class="thd-text-right"><span class='red'>167.08%</span></td> <td class="thd-text-right"><span class='red'>80.56%</span></td> <td class="thd-text-right"><span class='red'>92.3%</span></td> <td class="thd-text-right">2010-02-10</td> <td class="thd-text-right"><a href="http://huobijijin.com/jijin/160212" title="国泰估值优势混合(LOF)的历史收益情况" target="_blank">详情</a></td> </tr>
1. 11-17: 这是日期,至于年份,可以从文件名得出
2. 160212: 这是基金代码
3. 1.9230:这是基金净值
4. 国泰估值优势混合(LOF):这是基金名称
{ 160212: { 'title': '国泰估值优势混合(LOF)', '2015-11-17': 1.923, '2015-11-18': xxx, ... }, ... }
use strict;
use warnings;

# Scans a directory tree of saved fund-ranking HTML pages and collects, per
# fund id, the fund title and one net-value entry per date into %total_hash:
#
#   { 160212 => { title => '...', '2015-11-17' => '1.9230', ... }, ... }
#
# NOTE: there is deliberately no "use utf8" here. Files are read as raw bytes
# and the non-ASCII literal in $TITLE_RE is therefore matched byte-for-byte.

our @dir_set    = ();    # stack of directories still waiting to be scanned
our %total_hash = ();    # fund id => { title => ..., 'YYYY-MM-DD' => value }

# Patterns are compiled once instead of on every row.
my $TR_RE    = qr/^\s*\<tr\>\s*/;
my $CELL_RE  = qr/\s*\<td.*?\>(.+)\<\/td\>/;
my $ID_RE    = qr/\s*\<td.*?\>(\d+)\<\/td\>/;
my $TITLE_RE = qr/\s*\<td.*?\>.*?title=\"(.+?)的历史收益情况\".+?\<\/td\>/;

# Scan one directory: parse every plain file in it and push every
# sub-directory onto @dir_set for later processing.
sub _scan_directory {
    my ($dir) = @_;

    opendir(my $dh, $dir) or die "Can not open $dir. $!\n";
    my @all_entries = grep { !/^\.\.?$/ } readdir($dh);
    closedir($dh);

    foreach my $entry (@all_entries) {
        my $path = join('/', $dir, $entry);
        if (-d $path) {
            push(@dir_set, $path);
        }
        elsif (-f $path) {
            read_file($path);
        }
    }
    return;
}

# get_files($dir)
#   Parse every file below $dir (recursively) into %total_hash.
#   Dies if a directory or file cannot be opened.
#   Directories are drained from @dir_set in LIFO order with a loop, so the
#   call depth no longer grows with the number of directories.
sub get_files {
    my ($dir) = @_;

    _scan_directory($dir);
    while (@dir_set) {
        _scan_directory(pop(@dir_set));
    }
    return;
}

# Return capture group 1 of $pattern applied to line $index of @$lines, or
# undef when the line does not exist or does not match.
sub _capture_cell {
    my ($lines, $index, $pattern) = @_;

    # The look-ahead may point past the end of the file near its last <tr>.
    return undef if $index > $#{$lines};
    if ($lines->[$index] =~ $pattern) {
        return $1;
    }
    return undef;
}

# read_file($path_name)
#   Parse one HTML file. A row starts at a line matching <tr>; relative to
#   that line the cells are expected at fixed offsets: +2 date (MM-DD),
#   +3 fund id, +4 title link, +5 net value. The year is taken from a
#   "YYYY-MM-DD_" component of the path ('????' if there is none).
#   Dies if the file cannot be opened.
sub read_file {
    my ($path_name) = @_;

    my $year = '????';
    if ($path_name =~ /.+?\/(\d{4})-\d{2}-\d{2}_.+/) {
        $year = $1;
    }

    # $! (not $@) carries the operating-system error of a failed open.
    open(my $fh, '<', $path_name) or die "Can't open $path_name: $!.";
    my @content = <$fh>;
    close($fh);

    my $size = @content;
    my $i    = 0;
    while ($i < $size) {
        if ($content[$i] !~ $TR_RE) {
            $i++;
            next;
        }

        # On a cell that does not match, skip ahead to that cell's line.
        my $day = _capture_cell(\@content, $i + 2, $CELL_RE);
        if (!defined $day) { $i += 2; next; }

        my $id = _capture_cell(\@content, $i + 3, $ID_RE);
        if (!defined $id) { $i += 3; next; }

        my $title = _capture_cell(\@content, $i + 4, $TITLE_RE);
        if (!defined $title) { $i += 4; next; }

        my $value = _capture_cell(\@content, $i + 5, $CELL_RE);
        if (!defined $value) { $i += 5; next; }

        $i += 6;

        # The title is recorded only the first time a fund id is seen;
        # a repeated date overwrites the earlier value.
        $total_hash{$id}{'title'} = $title unless exists $total_hash{$id};
        $total_hash{$id}{"${year}-${day}"} = $value;
    }
    return;
}

# print_total_hash()
#   Print every fund (sorted by id): id, title, then one "date: value" line
#   per date in sorted order, followed by a blank line.
sub print_total_hash {
    foreach my $key (sort keys %total_hash) {
        print $key, "\n";
        if (exists $total_hash{$key}{'title'}) {
            print $total_hash{$key}{'title'}, "\n";
        }
        # keys needs a real hash (keys on a reference is an error since
        # Perl 5.24), and the 'title' key must be excluded with a string
        # comparison.
        foreach my $date (sort keys %{ $total_hash{$key} }) {
            next if $date eq 'title';
            print " ${date}: $total_hash{$key}{$date}\n";
        }
        print "\n";
    }
    return;
}

##### Main Body #####
# Run only when executed as a script, not when loaded from another file.
unless (caller) {
    get_files("./web_pages");
    # print_total_hash();
}

1;
# coding=utf-8
"""Collect fund net values from saved HTML ranking pages.

Every file below a directory tree is scanned for table rows; the result is
gathered in ``total_hash``::

    {fund_id: {'title': fund_title, 'YYYY-MM-DD': net_value, ...}, ...}

The code is written so that it runs unchanged on Python 2.7 and Python 3.
"""
import re
import os

file_path_mode = re.compile(r'.*?(\d{4})-\d{2}-\d{2}')
tr_mode = re.compile(r'\s*<tr>\s*')
date_mode = re.compile(r'\s*<td.*?>(.+)\</td>')
id_mode = re.compile(r'\s*<td.*?>(\d+)</td>')
title_mode = re.compile(r'\s*<td.*?>.*title="(.+?)的历史收益情况".+</td>')
value_mode = re.compile(r'\s*<td.*?>(.+)</td>')

total_hash = {}

# (line offset from the <tr> line, pattern) for the four cells of interest,
# in the order: date (MM-DD), fund id, title, net value.
_ROW_CELLS = (
    (2, date_mode),
    (3, id_mode),
    (4, title_mode),
    (5, value_mode),
)


def _first_group(pattern, lines, index):
    """Return group 1 of ``pattern`` searched in ``lines[index]``.

    Returns None when the line does not exist, does not match, or the
    captured text is empty.
    """
    if index >= len(lines):
        return None
    found = pattern.search(lines[index])
    if found is None or not found.group(1):
        return None
    return found.group(1)


def analyze_file(file_path):
    """Parse one HTML file and merge its rows into ``total_hash``.

    A row starts at a line matching ``<tr>``; the cells are expected at the
    fixed offsets listed in ``_ROW_CELLS``. The year is taken from the first
    ``YYYY-MM-DD`` found in ``file_path`` ('????' if there is none).

    For a fund id the title is stored once, and for a given date the first
    value seen is kept.

    Raises whatever ``open``/``read`` raises (after printing a message).
    """
    year = '????'
    path_match = file_path_mode.match(file_path)
    if path_match:
        year = path_match.group(1)

    try:
        with open(file_path, "r") as infile:
            lines = infile.read().split('\n')
    except Exception as ex:
        print("Error happened when open file: " + str(ex))
        raise

    i = 0
    while i < len(lines):
        if not tr_mode.match(lines[i]):
            i += 1
            continue

        cells = []
        for offset, pattern in _ROW_CELLS:
            text = _first_group(pattern, lines, i + offset)
            if text is None:
                break
            cells.append(text)

        if len(cells) != len(_ROW_CELLS):
            # Not a data row (e.g. a header row): store nothing and move on
            # by one line so that no following <tr> can be skipped.
            i += 1
            continue

        month_day, fund_id, title, value = cells
        i += 6

        record = total_hash.setdefault(fund_id, {'title': title})
        record.setdefault("%s-%s" % (year, month_day), value)


def list_files(target_dir):
    """Run ``analyze_file`` on every file below ``target_dir``.

    ``os.walk`` already descends into sub-directories, so no explicit
    recursion is needed.
    """
    for root, _dirs, files in os.walk(target_dir):
        for name in files:
            analyze_file(os.path.join(root, name))


def print_total_hash():
    """Print every fund sorted by id: id, title, then sorted date lines."""
    for fund_id in sorted(total_hash.keys()):
        print(fund_id)
        print(total_hash[fund_id]['title'])
        for date in sorted(total_hash[fund_id].keys()):
            if date != 'title':
                print(" %s: %s" % (date, total_hash[fund_id][date]))
        print("")


if __name__ == "__main__":
    list_files('./web_pages')
    # print_total_hash()
1. 硬件:8核,16GB 内存的强机 - HP的ZBook15 (售价2.5万-3万哦,当然,这是公司电脑 :P)
2. 操作系统: Windows 7
3. 执行环境:Cygwin
4. Perl 版本: 5.22
5. Python版本:2.7.10
1. 从real time来看,Python程序比Perl程序快了 (18.539 - 17.317) / 18.539 * 100% = 6.6%
2. 从user time来看,Python程序比Perl程序快了 (15.487-13.739)/15.487 * 100% = 11.3%
1. 开发效率:Python 胜 Perl
2. 普通代码可读性: Python 胜 Perl
3. 正则表达式写法: Python 负 Perl
4. 综合执行效率: Python 胜 Perl
5. 代码行数: Python 胜 Perl
Python 4:1 Perl
Python胜出!-- 嗯,我决定以后只写Python不写Perl了。所谓“人生苦短,我用Python”,还真是有道理的!