/// <summary>
/// An IP proxy entry: the address/port pair that is validated and stored in the proxy pool.
/// </summary>
public class IpProxy
{
    /// <summary>
    /// IP address of the proxy (filled from scraped HTML text by the spiders).
    /// </summary>
    public string Address { get; set; }

    /// <summary>
    /// Port of the proxy (parsed with int.Parse by the spiders).
    /// </summary>
    public int Port { get; set; }
}
然后实现一个基于Redis的IP代理池操作服务:
// PoolManageService: proxy-pool operations on the Redis set RedisSetNameEnum.ProxyPool.
// NOTE(review): this listing lost tokens during extraction. The `try` keywords, string
// literals, the calls inside the catch blocks and parts of two expressions are missing,
// and the bare numbers are line-number residue from the original listing.
//
// GetProxy(): starts with result = string.Empty, then reads a random item through
//   RedisManageService.GetRandomItemFromSet. When the item is non-null and the (truncated)
//   check on its two indexed parts [0] and [1] holds, it calls DeleteProxy(result) and
//   returns the value of a recursive GetProxy() call; otherwise it returns the item.
//   Presumably the truncated check is a negated availability test on address/port - TODO confirm.
// DeleteProxy(value): removes value from the set through RedisManageService.RemoveItemFromSet.
// Add(proxy): only when HttpHelper.IsAvailable(proxy.Address, proxy.Port) is true does it
//   run the (truncated) statement ending in proxy.Port.ToString() - presumably the insert
//   of the address/port string into the set; TODO confirm against the original listing.
// Every method has a catch (Exception e) whose body is a truncated call that receives e -
// presumably a log call; verify.
基于Redis的代理池管理服务 PoolManageService 5 { 从代理池随机获取一条代理 GetProxy() 11 { 12 string result = string.Empty; { 16 result = RedisManageService.GetRandomItemFromSet(RedisSetNameEnum.ProxyPool); 17 if (result != null) 18 { 19 if ( })[0], })[1]))) 22 { 23 DeleteProxy(result); 24 return GetProxy(); 25 } 26 } 27 } 28 catch (Exception e) 29 { , e)); 31 } 32 return result; 33 } 从代理池删除一条代理 DeleteProxy(string value) 40 { { 43 RedisManageService.RemoveItemFromSet(RedisSetNameEnum.ProxyPool, value); 44 } 45 catch (Exception e) 46 { , e)); 48 } 49 } 添加一条代理到代理池 Add(IpProxy proxy) 56 { { 59 if (HttpHelper.IsAvailable(proxy.Address, proxy.Port)) 60 { + proxy.Port.ToString()); 62 } 63 } 64 catch (Exception e) 65 { , e)); 67 } 68 } 69 }
该服务提供三个简单的方法:添加代理IP、删除代理IP、随机获取一条代理IP。
我们还需要一个爬虫服务,来爬取我们需要的免费代理IP数据:
// IpPoolSpider: crawler that scrapes proxy-list pages and passes every parsed entry to
// PoolManageService.Add.
// NOTE(review): this listing lost tokens during extraction. All string literals (URLs,
// XPath expressions, log messages), the `try` keywords and some statements are missing,
// and the bare numbers are line-number residue from the original listing.
//
// Initial(): queues Downloadproxy360, DownloadproxyBiGe, Downloadproxy66 and
//   Downloadxicidaili through ThreadPool.QueueUserWorkItem. Downkuaidaili is defined below
//   but is not queued here.
// Shared shape of the Down* methods: fetch HTML with HttpHelper.DownloadHtml, load it into
//   an HtmlDocument, select nodes by XPath, build one IpProxy per node (Address from
//   InnerHtml, Port through int.Parse) and call PoolManageService.Add(proxy) inside Task.Run.
// Downloadxicidaili: for each list URL and page i = 1..4 it first takes a proxy string from
//   PoolManageService.GetProxy() and returns when that string is null or empty; the page is
//   downloaded through a WebProxy built from it. The other methods pass null as the proxy.
// Empty-HTML handling differs per method: Downloadxicidaili and DownloadproxyBiGe `continue`
//   to the next page, Downloadproxy360 returns, Downloadproxy66 `break`s out of its URL loop;
//   the visible Downkuaidaili code shows no empty-HTML check (it may have been truncated).
// NOTE(review): the SelectNodes results are enumerated without a visible null check -
//   confirm whether SelectNodes can return null when nothing matches.
// Every method has a catch (Exception e) whose body is a truncated call that receives e -
// presumably a log call; verify.
IP池 抓取蜘蛛 3 /// TODO:代理池站点变化较快,时常关注日志监控 IpPoolSpider 6 { Initial() 8 { 9 ThreadPool.QueueUserWorkItem(Downloadproxy360); 10 ThreadPool.QueueUserWorkItem(DownloadproxyBiGe); 11 ThreadPool.QueueUserWorkItem(Downloadproxy66); 12 ThreadPool.QueueUserWorkItem(Downloadxicidaili); 13 } Downloadxicidaili(object DATA) 17 { { 20 List<string> list = new List<string>() 21 { , , , }; 28 foreach (var utlitem in list) 29 { 30 for (int i = 1; i < 5; i++) 31 { 32 string url = utlitem + i.ToString(); 33 var ipProxy = PoolManageService.GetProxy(); 34 if (string.IsNullOrEmpty(ipProxy)) 35 { )); 37 return; 38 } 39 var ip = ipProxy; 40 WebProxy webproxy; )) 42 { })[0]; })[1]); 45 webproxy = new WebProxy(ip, port); 46 } { 49 webproxy = new WebProxy(ip); 50 } 51 string html = HttpHelper.DownloadHtml(url, webproxy); 52 if (string.IsNullOrEmpty(html)) 53 { + url + )); 55 continue; 56 } 57 58 HtmlDocument doc = new HtmlDocument(); 59 doc.LoadHtml(html); 60 HtmlNode node = doc.DocumentNode; ; 62 HtmlNodeCollection collection = node.SelectNodes(xpathstring); 63 foreach (var item in collection) 64 { 65 var proxy = new IpProxy(); ; 67 proxy.Address = item.SelectSingleNode(xpath).InnerHtml; ; 69 proxy.Port = int.Parse(item.SelectSingleNode(xpath).InnerHtml); 70 Task.Run(() => 71 { 72 PoolManageService.Add(proxy); 73 }); 74 } 75 } 76 } 77 } 78 catch (Exception e) 79 { , e)); 81 } 82 } Downkuaidaili(object DATA) 86 { { ; 90 for (int i = 1; i < 4; i++) 91 { , null); ; 94 HtmlDocument doc = new HtmlDocument(); 95 doc.LoadHtml(html); 96 HtmlNode node = doc.DocumentNode; 97 HtmlNodeCollection collection = node.SelectNodes(xpath); 98 foreach (var item in collection) 99 { 100 var proxy = new IpProxy(); 101 proxy.Address = item.FirstChild.InnerHtml; ; 103 proxy.Port = int.Parse(item.SelectSingleNode(xpath).InnerHtml); 104 Task.Run(() => 105 { 106 PoolManageService.Add(proxy); 107 }); 108 } 109 } 110 } 111 catch (Exception e) 112 { , e)); 114 } 115 } Downloadproxy360(object DATA) 119 { { ; 123 string html = 
HttpHelper.DownloadHtml(url, null); 124 if (string.IsNullOrEmpty(html)) 125 { + url + )); 127 return; 128 } 129 HtmlDocument doc = new HtmlDocument(); 130 doc.LoadHtml(html); ; 132 HtmlNode node = doc.DocumentNode; 133 HtmlNodeCollection collection = node.SelectNodes(xpathstring); (var item in collection) 136 { 137 var proxy = new IpProxy(); 138 var childnode = item.ChildNodes[1]; ; 140 proxy.Address = childnode.SelectSingleNode(xpathstring).InnerHtml.Trim(); ; 142 proxy.Port = int.Parse(childnode.SelectSingleNode(xpathstring).InnerHtml); 143 Task.Run(() => 144 { 145 PoolManageService.Add(proxy); 146 }); 147 } 148 } 149 catch (Exception e) 150 { , e)); 152 } 153 } DownloadproxyBiGe(object DATA) 157 { { 160 List<string> list = new List<string>() 161 { , , , }; 167 foreach (var utlitem in list) 168 { 169 for (int i = 1; i < 5; i++) 170 { 171 string url = String.Format(utlitem, i); 172 string html = HttpHelper.DownloadHtml(url, null); 173 if (string.IsNullOrEmpty(html)) 174 { + url + )); 176 continue; 177 } 178 179 HtmlDocument doc = new HtmlDocument(); 180 doc.LoadHtml(html); 181 HtmlNode node = doc.DocumentNode; ; 183 HtmlNodeCollection collection = node.SelectNodes(xpathstring); 184 foreach (var item in collection) 185 { 186 var proxy = new IpProxy(); ; 188 proxy.Address = item.SelectSingleNode(xpath).InnerHtml; ; 190 proxy.Port = int.Parse(item.SelectSingleNode(xpath).InnerHtml); 191 Task.Run(() => 192 { 193 PoolManageService.Add(proxy); 194 }); 195 } 196 } 197 } 198 } 199 catch (Exception e) 200 { , e)); 202 } 203 } Downloadproxy66(object DATA) 207 { { 210 List<string> list = new List<string>() 211 { , , }; 216 foreach (var utlitem in list) 217 { 218 string url = utlitem; 219 string html = HttpHelper.DownloadHtml(url, null); 220 if (string.IsNullOrEmpty(html)) 221 { + url + )); 223 break; 224 } 225 226 HtmlDocument doc = new HtmlDocument(); 227 doc.LoadHtml(html); 228 HtmlNode node = doc.DocumentNode; ; 230 HtmlNodeCollection collection = 
node.SelectNodes(xpathstring); 231 foreach (var item in collection) 232 { 233 var proxy = new IpProxy(); ; 235 proxy.Address = item.SelectSingleNode(xpath).InnerHtml; )) 237 { 238 continue; 239 } ; 241 proxy.Port = int.Parse(item.SelectSingleNode(xpath).InnerHtml); 242 Task.Run(() => 243 { 244 PoolManageService.Add(proxy); 245 }); 246 } 247 } 248 } 249 catch (Exception e) 250 { , e)); 252 } 253 } 254 }
这段代码也没什么营养,就不仔细解释了。
前面说到,博主的爬虫服务都是以Windows服务的方式部署的。以前一直用Timer来实现固定间隔的循环执行,这次博主引入了Quartz.NET任务调度框架来做,代码看起来更优美一点。
Quartz.NET可直接在NuGet下载安装。
先写一个代理池的总调度任务类ProxyPoolTotalJob,实现IJob接口: