代码语言
.
CSharp
.
JS
Java
Asp.Net
C
MSSQL
PHP
Css
PLSQL
Python
Shell
EBS
ASP
Perl
ObjC
VB.Net
VBS
MYSQL
GO
Delphi
AS
DB2
Domino
Rails
ActionScript
Scala
代码分类
文件
系统
字符串
数据库
网络相关
图形/GUI
多媒体
算法
游戏
Jquery
Extjs
Android
HTML5
菜单
网页交互
WinForm
控件
企业应用
安全与加密
脚本/批处理
开放平台
其它
【
Asp.Net
】
采集程序完整代码
作者:
Dezai.CN
/ 发布于
2012/3/21
/
560
采集程序完整代码
namespace CJ
{
    /// <summary>
    /// WinForms front end of a four-level crawler for http://www.zydg.net/magazine/.
    /// Every page is cached on disk under <c>E:\观2</c>, parsed with regular expressions,
    /// turned into INSERT statements (buffered and executed in one batch) and, for
    /// article pages, into <c>Class1</c> objects that are serialised to XML.
    /// All HTTP traffic goes through a rotating list of proxies read from <c>e:\test.txt</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Text file with one proxy address per line.</summary>
        private const string ProxyListPath = "e:\\test.txt";

        /// <summary>Root directory of the on-disk page cache.</summary>
        private const string OutputRoot = "E:\\观2";

        /// <summary>How many times the whole proxy list may be cycled for one request before giving up.</summary>
        private const int MaxProxyPasses = 3;

        /// <summary>Index into <see cref="al"/> of the proxy currently in use.</summary>
        public int proxy = 0;
        /// <summary>Last primary key issued for table mzinedl (see <see cref="HtmlSource"/>).</summary>
        public int keyi = 0;
        /// <summary>Last primary key issued for table mzinexl (see <see cref="two"/>).</summary>
        public int keyj = 0;
        /// <summary>Last primary key issued for table mzineinfo (see <see cref="three"/>).</summary>
        public int keym = 0;
        /// <summary>Last primary key issued for table mzinewz (see <see cref="three"/>).</summary>
        public int keyn = 0;
        /// <summary>Number of pages loaded so far, whether from the cache or from the network.</summary>
        public int sum = 0;
        public string newurl = "";
        public string cururl = "";
        public string dirname = "";
        public string curdir = "";
        public string responseFromServer = "";
        public string filename = "";
        public string sql = "";
        /// <summary>Most recent section heading seen while parsing an issue page.</summary>
        public string mulu = "";

        /// <summary>Buffer of SQL statements, one per line, executed by <see cref="Executesql"/>.</summary>
        StringBuilder sbs = new StringBuilder();
        /// <summary>Articles waiting to be written by <see cref="SaveXmls"/>.</summary>
        List<Class1> cls = new List<Class1>();
        /// <summary>Proxy addresses (strings) loaded from the proxy list file.</summary>
        public ArrayList al = new ArrayList();

        public string insertdl = "insert into mzinedl values(";
        public string insertxl = "insert into mzinexl values(";
        public string insertinfo = "insert into mzineinfo values(";
        public string insertwz = "insert into mzinewz values(";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Writes <paramref name="data"/> to a new text file. Does nothing when the file already exists.
        /// </summary>
        /// <param name="FILE_NAME">Path of the file to create.</param>
        /// <param name="data">Text to write.</param>
        public void TextToFile(string FILE_NAME, string data)
        {
            if (File.Exists(FILE_NAME))
            {
                return;
            }

            using (StreamWriter sw = File.CreateText(FILE_NAME))
            {
                sw.Write(data);
            }
        }

        /// <summary>
        /// Downloads a file through the current proxy into a directory, keeping the last
        /// URL segment as the file name. An existing file is never downloaded again.
        /// On connect failure or timeout the next proxy is tried; any other web error
        /// (for example a 404) abandons the download silently.
        /// </summary>
        /// <param name="PageUrl">URL of the file to download.</param>
        /// <param name="filename">Target DIRECTORY (created when missing), despite the parameter name.</param>
        /// <exception cref="InvalidOperationException">The proxy list is empty.</exception>
        public void DownFile(string PageUrl, string filename)
        {
            if (!Directory.Exists(filename))
            {
                Directory.CreateDirectory(filename);
            }

            string targetPath = filename + "\\" + PageUrl.Substring(PageUrl.LastIndexOf("/") + 1);
            if (File.Exists(targetPath))
            {
                return;
            }

            // Bounded loop instead of unbounded recursion: a dead proxy list can no
            // longer overflow the stack or hang the crawl forever.
            for (int attemptsLeft = RetryBudget(); attemptsLeft > 0; attemptsLeft--)
            {
                try
                {
                    using (WebClient wc = new WebClient())
                    {
                        wc.Proxy = CurrentProxy();
                        wc.DownloadFile(PageUrl, targetPath);
                    }
                    return;
                }
                catch (WebException ex)
                {
                    bool proxyProblem = ex.Status == WebExceptionStatus.ConnectFailure
                                        || ex.Status == WebExceptionStatus.Timeout;
                    if (!proxyProblem)
                    {
                        // Protocol errors (file not found) and anything else: give up on this file.
                        return;
                    }
                    UseNextProxy();
                }
            }
        }

        /// <summary>
        /// Appends every line of a text file to the proxy list <see cref="al"/>.
        /// </summary>
        /// <param name="FILE_NAME">Path of the proxy list file.</param>
        /// <returns>The <see cref="al"/> list itself, after appending.</returns>
        public ArrayList ReadIPproxy(string FILE_NAME)
        {
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    al.Add(line);
                }
            }
            return al;
        }

        /// <summary>
        /// Executes all buffered SQL statements as a single text command.
        /// </summary>
        public void Executesql()
        {
            SqlHelper.ExecuteNonQuery(SqlHelper.sqlstr, CommandType.Text, sbs.ToString(), null);
        }

        /// <summary>
        /// Reads a whole text file.
        /// </summary>
        /// <param name="FILE_NAME">Path of the file to read.</param>
        /// <returns>The file content.</returns>
        public string FileToText(string FILE_NAME)
        {
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Appends one SQL statement, followed by a line feed, to the batch buffer.
        /// </summary>
        /// <param name="sql">Statement to buffer.</param>
        public void SaveSqls(string sql)
        {
            sbs.Append(sql).Append("\n");
        }

        /// <summary>
        /// Downloads a page through the current proxy, switching to the next proxy and
        /// retrying when the request fails, the body is empty, or the body contains the
        /// text "refresh".
        /// </summary>
        /// <param name="PageUrl">URL to fetch.</param>
        /// <returns>
        /// The page text decoded with the system default encoding, or an empty string when
        /// the server answers with a protocol error or every allowed attempt failed.
        /// </returns>
        /// <exception cref="InvalidOperationException">The proxy list is empty.</exception>
        public string ToServer(string PageUrl)
        {
            // The retry is a loop so that the successful attempt's text is what gets
            // returned; the attempt count is bounded by the size of the proxy list.
            for (int attemptsLeft = RetryBudget(); attemptsLeft > 0; attemptsLeft--)
            {
                string body;
                try
                {
                    WebRequest request = WebRequest.Create(PageUrl);
                    request.Proxy = CurrentProxy();
                    request.Timeout = 1000 * 60;

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (Stream dataStream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(dataStream, System.Text.Encoding.Default))
                    {
                        body = reader.ReadToEnd();
                    }
                }
                catch (WebException ex)
                {
                    if (ex.Status == WebExceptionStatus.ProtocolError)
                    {
                        // The server itself rejected the request; another proxy will not help.
                        return "";
                    }
                    UseNextProxy();
                    continue;
                }
                catch (IOException)
                {
                    // Connection dropped while reading the body.
                    UseNextProxy();
                    continue;
                }

                if (body != "" && !body.Contains("refresh"))
                {
                    return body;
                }

                // Empty body or a "refresh" page: treat as a bad proxy and try the next one.
                UseNextProxy();
            }

            return "";
        }

        /// <summary>
        /// Serialises every collected article to the XML file named by its <c>address</c>
        /// member. Files that already exist are left untouched.
        /// </summary>
        public void SaveXmls()
        {
            foreach (Class1 article in cls)
            {
                string pathxml = article.address;
                if (File.Exists(pathxml))
                {
                    continue;
                }

                XmlSerializer xs = new XmlSerializer(typeof(Class1));
                using (Stream stream = new FileStream(pathxml, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                {
                    xs.Serialize(stream, article);
                }
            }
        }

        /// <summary>
        /// Removes HTML tags (every shortest "&lt;...&gt;" run that does not span a line break).
        /// </summary>
        /// <param name="Html">Markup to clean.</param>
        /// <returns>The text with the matched tags removed.</returns>
        public static string Remove(string Html)
        {
            string regesstr = "<.*?>";
            return Regex.Replace(Html, regesstr, string.Empty, RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Removes every script element together with its content.
        /// </summary>
        /// <param name="content">Markup to clean.</param>
        /// <returns>The markup without script elements.</returns>
        public static string FilterScript(string content)
        {
            // Lazy match: each script element is removed on its own, so markup between
            // two script elements survives, and the pattern cannot backtrack explosively.
            string regexstr = @"<(script)[^>]*>[\s\S]*?</\1>";
            return Regex.Replace(content, regexstr, string.Empty, RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Neutralises potentially dangerous markup: script elements, script-scheme href
        /// prefixes, inline "on..." event attributes, iframes and framesets.
        /// </summary>
        /// <param name="html">Markup to clean.</param>
        /// <returns>The cleaned markup.</returns>
        public string wipeScript(string html)
        {
            // Every pattern is bounded or lazy so that one match covers a single
            // element or attribute rather than everything up to the last delimiter.
            Regex scriptElement = new Regex(@"<script[^<>]*>[\s\S]*?</script\s*>", RegexOptions.IgnoreCase);
            Regex scriptHref = new Regex(@"href\s*=\s*[""']?\s*\w*script\s*:", RegexOptions.IgnoreCase);
            Regex eventAttribute = new Regex(@"\son\w+\s*=", RegexOptions.IgnoreCase);
            Regex iframeElement = new Regex(@"<iframe[\s\S]+?</iframe\s*>", RegexOptions.IgnoreCase);
            Regex framesetElement = new Regex(@"<frameset[\s\S]+?</frameset\s*>", RegexOptions.IgnoreCase);

            html = scriptElement.Replace(html, "");                  // script elements
            html = scriptHref.Replace(html, "");                     // href=javascript: and similar
            html = eventAttribute.Replace(html, " _disibledevent="); // on... event attributes
            html = iframeElement.Replace(html, "");                  // iframes
            html = framesetElement.Replace(html, "");                // framesets
            return html;
        }

        /// <summary>
        /// Level 0: loads the magazine index, records one mzinedl row per category and
        /// crawls each category. When done it writes the XML files, executes the SQL
        /// batch and reports the page count in the UI.
        /// </summary>
        /// <param name="urlpri">URL of the index page; category links are appended to it.</param>
        public void HtmlSource(string urlpri)
        {
            if (!LoadPage(urlpri, OutputRoot, OutputRoot + "\\magazine.html"))
            {
                return;
            }

            // Work on a local copy: the nested levels overwrite the shared field.
            string page = responseFromServer;
            MatchCollection categories = Regex.Matches(page, @"href=""/magazine/(.*)""><b>(.*)</b>", RegexOptions.IgnoreCase);
            foreach (Match m in categories)
            {
                newurl = m.Groups[1].Value;
                dirname = m.Groups[2].Value;
                int key = ++keyi;
                sql = insertdl + key + "," + SqlText(dirname) + ")";
                SaveSqls(sql);
                cururl = urlpri + newurl;
                curdir = OutputRoot + "\\" + dirname;
                one(cururl, curdir, key);
            }

            SaveXmls();
            Executesql();
            this.textBox1.Text = sum.ToString();
            MessageBox.Show("采集成功!");
        }

        /// <summary>
        /// Level 1: loads a category page and crawls every magazine linked from it.
        /// </summary>
        /// <param name="urlpri">URL of the category page.</param>
        /// <param name="_dirname">Cache directory of the category.</param>
        /// <param name="_key">mzinedl key of the category.</param>
        public void one(string urlpri, string _dirname, int _key)
        {
            string cachePath = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
            if (!LoadPage(urlpri, _dirname, cachePath))
            {
                return;
            }

            string page = responseFromServer;
            MatchCollection magazines = Regex.Matches(page, @"href=""\.\./(.*list.html)""[\s\S]*?《(.*?)》", RegexOptions.IgnoreCase);
            foreach (Match m in magazines)
            {
                newurl = m.Groups[1].Value;
                // two() reads the magazine title from this field.
                dirname = m.Groups[2].Value;
                cururl = "http://www.zydg.net/magazine/" + newurl;
                curdir = _dirname + "\\" + dirname;
                two(cururl, curdir, _key);
            }
        }

        /// <summary>
        /// Level 2: loads a magazine page, records its metadata as one mzinexl row and
        /// crawls every issue linked from it. The magazine title is taken from the
        /// <see cref="dirname"/> field as set by the caller.
        /// </summary>
        /// <param name="urlpri">URL of the magazine's issue list page.</param>
        /// <param name="_dirname">Cache directory of the magazine.</param>
        /// <param name="_key">mzinedl key of the owning category.</param>
        public void two(string urlpri, string _dirname, int _key)
        {
            // The cache file is named after the URL's parent folder.
            string parentUrl = urlpri.Substring(0, urlpri.LastIndexOf("/"));
            string cachePath = _dirname + "\\" + parentUrl.Substring(parentUrl.LastIndexOf("/") + 1) + ".html";
            if (!LoadPage(urlpri, _dirname, cachePath))
            {
                return;
            }

            string page = responseFromServer;

            // Each field is captured as the text up to the next tag. (A lazy "(.*?)"
            // directly followed by a lazy "[\s\S]*?" can only ever capture an empty string.)
            // NOTE(review): assumes each value is plain text right after its label - confirm against a live page.
            Match info = Regex.Match(page, @"刊\s+期:([^<]*)[\s\S]*?编\s+辑:([^<]*)[\s\S]*?出\s+版:([^<]*)[\s\S]*?联系电话:([^<]*)[\s\S]*?E-mail:([^<]*)[\s\S]*?社\s+址:([^<]*)[\s\S]*?邮\s+编:([^<]*)[\s\S]*?邮发代号:([^<]*)[\s\S]*?国外发行代号:([^<]*)[\s\S]*?国际标准刊号:([^<]*)[\s\S]*?国内统一刊号:(.*?)</td>", RegexOptions.IgnoreCase);
            Match intro = Regex.Match(page, @"刊\s+物\s+简\s+介\s+:::...([\s\S]*?)...:::\s+收录期号列表", RegexOptions.Multiline);

            int key = ++keyj;
            StringBuilder row = new StringBuilder(insertxl);
            row.Append(key).Append(",").Append(_key).Append(",").Append(SqlText(dirname));
            for (int g = 1; g <= 11; g++)
            {
                row.Append(",").Append(SqlText(info.Groups[g].Value.Trim()));
            }
            row.Append(",").Append(SqlText(Remove(intro.Groups[1].Value))).Append(")");
            sql = row.ToString();
            SaveSqls(sql);

            string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);
            MatchCollection issues = Regex.Matches(page, @"href='(.*?)'\s+target.*>(.*?)</a>", RegexOptions.IgnoreCase);
            foreach (Match issue in issues)
            {
                newurl = issue.Groups[1].Value;
                // Issue label such as "2011年第3期" becomes "2011-3".
                dirname = issue.Groups[2].Value.Replace("年", "-").Replace("第", "").Replace("期", "");
                cururl = baseUrl + newurl;
                curdir = _dirname + "\\" + dirname;
                three(cururl, curdir, key, dirname);
            }
        }

        /// <summary>
        /// Level 3: loads an issue page, downloads its cover image, records one mzineinfo
        /// row for the issue and one mzinewz row per article, and crawls every article.
        /// </summary>
        /// <param name="urlpri">URL of the issue page.</param>
        /// <param name="_dirname">Cache directory of the issue.</param>
        /// <param name="_key">mzinexl key of the owning magazine.</param>
        /// <param name="qishu">Issue label stored with the row.</param>
        public void three(string urlpri, string _dirname, int _key, string qishu)
        {
            string cachePath = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
            if (!LoadPage(urlpri, _dirname, cachePath))
            {
                return;
            }

            string page = responseFromServer;
            string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);

            string coverSuffix = Regex.Match(page, @"src='face_(.*?)'", RegexOptions.IgnoreCase).Groups[1].Value;
            if (coverSuffix.Trim() != "")
            {
                DownFile(baseUrl + "face_" + coverSuffix, _dirname);
            }

            int key = ++keym;
            sql = insertinfo + key + "," + _key + "," + SqlText(qishu) + "," + SqlText(_dirname + "\\" + "face_" + coverSuffix) + ")";
            SaveSqls(sql);

            // Section headings must not leak from the previously parsed issue.
            mulu = "";

            // Alternative 1 is an article link (groups 1 and 2); alternative 2 is a
            // bracketed section heading (group 3).
            // NOTE(review): assumes headings use ASCII square brackets - confirm against a live page.
            MatchCollection entries = Regex.Matches(page, @"href='(\d+.html?)'[\s\S]*?<font\s+color=black>(.*?)</a>|<font[^>]*?>\[(.+?)\]", RegexOptions.IgnoreCase);
            foreach (Match entry in entries)
            {
                newurl = entry.Groups[1].Value;

                string muName = entry.Groups[3].Value;
                if (muName == "")
                {
                    // An article belongs to the most recent heading.
                    muName = mulu;
                }

                string label = entry.Groups[2].Value;
                if (label != "")
                {
                    // The link text is split at its first and last dot into two stored columns.
                    string s1 = label;
                    string s2 = "";
                    if (label.Contains("."))
                    {
                        s1 = label.Substring(0, label.IndexOf("."));
                        s2 = label.Substring(label.LastIndexOf(".") + 1);
                    }

                    int k2 = ++keyn;
                    sql = insertwz + k2 + "," + key + "," + SqlText(muName) + "," + SqlText(s1) + "," + SqlText(s2) + ")";
                    SaveSqls(sql);
                    cururl = baseUrl + newurl;
                    curdir = _dirname;
                    four(cururl, curdir, k2);
                }

                mulu = muName;
            }
        }

        /// <summary>
        /// Level 4: loads an article page, queues its cleaned text for XML serialisation
        /// and downloads the images and PDF files it references.
        /// </summary>
        /// <param name="urlpri">URL of the article page.</param>
        /// <param name="_dirname">Cache directory of the issue.</param>
        /// <param name="_key">mzinewz key of the article.</param>
        public void four(string urlpri, string _dirname, int _key)
        {
            string pageFile = urlpri.Substring(urlpri.LastIndexOf("/") + 1);
            if (!LoadPage(urlpri, _dirname, _dirname + "\\" + pageFile))
            {
                return;
            }

            string page = responseFromServer;

            // Article body sits between two marker comments; strip scripts, then all tags.
            Match bodyMatch = Regex.Match(page, @"正文开始-->(?<text>[\s\S]*?)<!--正文结束", RegexOptions.IgnoreCase);
            string text = Remove(FilterScript(bodyMatch.Groups["text"].Value));

            // Same file name as the page, with an .xml extension (also works without a dot in the name).
            string pathxml = _dirname + "\\" + Path.ChangeExtension(pageFile, "xml");
            cls.Add(new Class1(_key, text, pathxml));

            string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);
            // The PDF alternative is captured under the name "pdfs", which is the name read below.
            MatchCollection assets = Regex.Matches(page, @"(<img\s+src=""(?<imgs>.*)""\s+hspace|HreF=""(?<pdfs>[^>]*PDF)"")", RegexOptions.IgnoreCase);
            foreach (Match asset in assets)
            {
                string imgurl = asset.Groups["imgs"].Value.Trim();
                if (imgurl != "")
                {
                    DownFile(baseUrl + imgurl, _dirname);
                }

                string pdfurl = asset.Groups["pdfs"].Value.Trim();
                if (pdfurl != "")
                {
                    DownFile(baseUrl + pdfurl, _dirname);
                }
            }
        }

        /// <summary>Loads the proxy list and starts the crawl from the magazine index.</summary>
        private void btnOK_Click(object sender, EventArgs e)
        {
            LoadProxies();
            HtmlSource("http://www.zydg.net/magazine/");
        }

        /// <summary>Closes the application.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            Application.Exit();
        }

        /// <summary>
        /// Loads one page, preferring the cached copy at <paramref name="cachePath"/>, and
        /// caches a freshly downloaded non-empty page. Updates <see cref="filename"/>,
        /// <see cref="responseFromServer"/> and <see cref="sum"/>.
        /// </summary>
        /// <returns><c>true</c> when a non-empty page is available in <see cref="responseFromServer"/>.</returns>
        private bool LoadPage(string url, string directory, string cachePath)
        {
            filename = cachePath;
            if (!Directory.Exists(directory))
            {
                Directory.CreateDirectory(directory);
            }

            if (File.Exists(cachePath))
            {
                responseFromServer = FileToText(cachePath);
            }
            else
            {
                responseFromServer = ToServer(url);
            }
            sum++;

            if (responseFromServer == "")
            {
                return false;
            }

            TextToFile(cachePath, responseFromServer);
            return true;
        }

        /// <summary>Replaces the proxy list with the current content of the proxy file and selects the first entry.</summary>
        private void LoadProxies()
        {
            al.Clear();
            proxy = 0;
            ReadIPproxy(ProxyListPath);
        }

        /// <summary>Selects the next proxy, reloading the list from disk once the end is reached.</summary>
        private void UseNextProxy()
        {
            proxy++;
            if (proxy >= al.Count)
            {
                LoadProxies();
            }
        }

        /// <summary>Builds the proxy object for the currently selected list entry.</summary>
        /// <exception cref="InvalidOperationException">The proxy list is empty.</exception>
        private WebProxy CurrentProxy()
        {
            if (al.Count == 0)
            {
                throw new InvalidOperationException("The proxy list is empty: " + ProxyListPath);
            }
            if (proxy < 0 || proxy >= al.Count)
            {
                proxy = 0;
            }
            return new WebProxy(al[proxy].ToString(), true);
        }

        /// <summary>Number of attempts one request may make: every proxy, <see cref="MaxProxyPasses"/> times.</summary>
        private int RetryBudget()
        {
            return Math.Max(1, al.Count) * MaxProxyPasses;
        }

        /// <summary>
        /// Wraps a value in single quotes for use as a SQL string literal, doubling any
        /// embedded single quote so scraped text cannot terminate the literal.
        /// </summary>
        private static string SqlText(string value)
        {
            return "'" + (value ?? "").Replace("'", "''") + "'";
        }
    }
}
试试其它关键字
采集程序
同语言下
.
gzip压缩
.
实现http多线程断点续传下载文件
.
实现多线程断点续传下载大文件
.
生成字符串的 CheckSum
.
根据 UserAgent 获取浏览器的类型和版本
.
根据 Agent 判断是否是智能手机
.
隐藏手机号中间四位为*方法
.
合并图片(二维码和其他图片合并)
.
ASP.NET CORE中判断是否移动端打开网页
.
ASP.NET(C#)实现页面计时(定时)自动跳转
可能有用的
.
C#实现的html内容截取
.
List 切割成几份 工具类
.
SQL查询 多列合并成一行用逗号隔开
.
一行一行读取txt的内容
.
C#动态修改文件夹名称(FSO实现,不移动文件)
.
c# 移动文件或文件夹
.
c#图片添加水印
.
Java PDF转换成图片并输出给前台展示
.
网站后台修改图片尺寸代码
.
处理大图片在缩略图时的展示
Dezai.CN
贡献的其它代码
(
4037
)
.
多线程Socket服务器模块
.
生成随机密码
.
清除浮动样式
.
弹出窗口居中
.
抓取url的函数
.
使用base HTTP验证
.
div模拟iframe嵌入效果
.
通过header转向的方法
.
Session操作类
.
执行sqlite输入插入操作后获得自动编号的ID
Copyright © 2004 - 2024 dezai.cn. All Rights Reserved
站长博客
粤ICP备13059550号-3