On a whim, I decided to dig into web crawlers and scrape a novel or two.
I picked a novel site through Baidu and chose a short book at random.
1. Analyzing the HTML
The plan: fetch the novel's chapter list, loop over every chapter, scrape each chapter's body text, and append it all into one txt file until the complete novel is assembled.
1. Get the novel's chapter list
Inspecting the page source, the novel's name and its chapter list can be pulled from the two spots marked below.
<meta name="keywords" content="..." /> // the novel's name is in this meta tag
<table cellspacing="..." cellpadding="..." bgcolor="..." id="..."> // every chapter link lives in this table
The regular expressions below pull out the name and the chapter list.
// Get the novel's name from the keywords meta tag (pattern matches the markup shown above)
Match ma_name = Regex.Match(html, @"<meta name=""keywords"" content=""(.+?)"" />");
string name = ma_name.Groups[1].Value.Split(',')[0];
// Get the chapter-list table (matched loosely on the table's id attribute)
Regex reg_mulu = new Regex(@"<table[^>]*id=""[^""]*""[^>]*>[\s\S]*?</table>");
var mat_mulu = reg_mulu.Match(html);
string mulu = mat_mulu.Groups[0].ToString();
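As a quick sanity check on the group indexing, here is how the name pattern behaves on a made-up fragment (the keyword string is a placeholder, not the real site's):

string sample = @"<meta name=""keywords"" content=""某小说,最新章节"" />";
Match m = Regex.Match(sample, @"<meta name=""keywords"" content=""(.+?)"" />");
Console.WriteLine(m.Groups[1].Value.Split(',')[0]); // prints 某小说 — the part before the first comma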
2. Get the chapter body text
Each chapter's <a> tag carries the URL of the page that holds that chapter's content.
Inspection shows the body text sits inside a <dd> element.
// Get the body text from the <dd> wrapper
Regex reg = new Regex(@"<dd>([\s\S]*?)</dd>");
var mat = reg.Match(html_z);
// Strip the HTML line breaks and space entities (these forms are assumed from the site's markup)
string content = mat.Groups[1].Value.Replace("<br />", "\r\n").Replace("&nbsp;", " ");
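On a made-up <dd> fragment, the cleanup pass works like this (again, the <br /> and &nbsp; forms are assumptions about this particular site):

string raw = "<dd>&nbsp;&nbsp;第一段<br /><br />&nbsp;&nbsp;第二段</dd>";
Match m = new Regex(@"<dd>([\s\S]*?)</dd>").Match(raw);
string text = m.Groups[1].Value.Replace("<br />", "\r\n").Replace("&nbsp;", " ");
// text now holds plain paragraphs separated by blank lines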
2. Complete C# code
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Mvc;
namespace Test.Controllers
{
public class CrawlerController : BaseController
{
public void Index()
{
// Crawl the whole novel
CrawlerController cra = new CrawlerController();
string html = cra.HttpGet("...", ""); // "..." stands in for the novel's table-of-contents URL
// Get the novel's name (same pattern as the commented variant further down)
Match ma_name = Regex.Match(html, @"<meta name=""keywords"" content=""(.+?)"" />");
string name = ma_name.Groups[1].Value.Split(',')[0];
// Get the chapter-list table
Regex reg_mulu = new Regex(@"<table[^>]*id=""[^""]*""[^>]*>[\s\S]*?</table>");
var mat_mulu = reg_mulu.Match(html);
string mulu = mat_mulu.Groups[0].ToString();
// Match the url and title inside each a tag
Regex tmpreg = new Regex(@"<a href=""(.+?)"">(.+?)</a>", RegexOptions.Compiled);
MatchCollection sMC = tmpreg.Matches(mulu);
if (sMC.Count != 0)
{
for (int i = 0; i < sMC.Count; i++)
{
// sMC[i].Groups[1].Value is the chapter url
// sMC[i].Groups[2].Value is the chapter title, e.g. 第一章 泰山之巅
string title = sMC[i].Groups[2].Value;
string html_z = cra.HttpGet(sMC[i].Groups[1].Value, "");
// The novel's name could also be re-read from each chapter page:
//Match ma_name = Regex.Match(html, @"<meta name=""keywords"" content=""(.+)"" />");
//string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
// The chapter title could also be parsed out of the h1 tag:
//string title = html_z.Replace("<h1>", "*").Replace("</h1>", "*").Split('*')[1];
// Get the body text from the <dd> wrapper
Regex reg = new Regex(@"<dd>([\s\S]*?)</dd>");
var mat = reg.Match(html_z);
// Strip the HTML line breaks and space entities
string content = mat.Groups[1].Value.Replace("<br />", "\r\n").Replace("&nbsp;", " ");
// Output folder under the app's base directory ("Novel/" is a placeholder folder name)
string path = AppDomain.CurrentDomain.BaseDirectory + "Novel/";
Novel(title + "\r\n" + content, name, path);
}
}
}
// Create the text file (or append to it) — one call per chapter
public void Novel(string content, string name, string path)
{
string Log = content + "\r\n"; // the text written for this chapter
if (Directory.Exists(path) == false)
{
Directory.CreateDirectory(path);
}
if (!System.IO.File.Exists(path + name + ".txt"))
{
FileStream fs1 = new FileStream(path + name + ".txt", FileMode.Create, FileAccess.Write);// create the file for writing
StreamWriter sw = new StreamWriter(fs1);
sw.WriteLine(Log);// write the chapter text
sw.Close();
fs1.Close();
}
else
{
FileStream fs = new FileStream(path + name + ".txt", FileMode.Append, FileAccess.Write);
StreamWriter sr = new StreamWriter(fs);
sr.WriteLine(Log);// append the chapter text
sr.Close();
fs.Close();
}
}
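// Note: the create/append branches above could be collapsed into a single call —
// File.AppendAllText creates the file when it is missing and appends otherwise:
// System.IO.File.AppendAllText(path + name + ".txt", content + Environment.NewLine);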
public string HttpPost(string Url, string postDataStr)
{
CookieContainer cookie = new CookieContainer();
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = Encoding.UTF8.GetByteCount(postDataStr);
request.CookieContainer = cookie;
Stream myRequestStream = request.GetRequestStream();
StreamWriter myStreamWriter = new StreamWriter(myRequestStream, Encoding.UTF8);
myStreamWriter.Write(postDataStr);
myStreamWriter.Close();
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
response.Cookies = cookie.GetCookies(response.ResponseUri);
Stream myResponseStream = response.GetResponseStream();
StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.UTF8);
string retString = myStreamReader.ReadToEnd();
myStreamReader.Close();
myResponseStream.Close();
return retString;
}
public string HttpGet(string Url, string postDataStr)
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + (postDataStr == "" ? "" : "?") + postDataStr);
request.Method = "GET";
HttpWebResponse response;
request.ContentType = "text/html;charset=UTF-8";
try
{
response = (HttpWebResponse)request.GetResponse();
}
catch (WebException ex)
{
// On an error status code, fall back to the error response — retrying GetResponse() here would just throw again
response = (HttpWebResponse)ex.Response;
}
Stream myResponseStream = response.GetResponseStream();
StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.UTF8);// UTF-8 assumed; use Encoding.GetEncoding("gbk") for GBK-encoded pages
string retString = myStreamReader.ReadToEnd();
myStreamReader.Close();
myResponseStream.Close();
return retString;
}
}
}
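Regex scraping breaks the moment the site tweaks its markup. A more robust variant would parse the page with HtmlAgilityPack (a NuGet package; the XPath below uses a placeholder table id, not the real site's structure):

using System;
using System.Collections.Generic;
using HtmlAgilityPack;

public static class TocParser
{
// Yields (title, url) pairs from the chapter table; "chapterlist" is a placeholder id
public static IEnumerable<(string Title, string Url)> Chapters(string html)
{
var doc = new HtmlDocument();
doc.LoadHtml(html);
var links = doc.DocumentNode.SelectNodes("//table[@id='chapterlist']//a");
if (links == null) yield break; // no chapter table found
foreach (var a in links)
yield return (a.InnerText.Trim(), a.GetAttributeValue("href", ""));
}
}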