I've looked at a lot of code for grabbing a page's HTML source, and most of it either comes back as mojibake or doesn't account for gzip/deflate compression. A common trap is the charset: on some pages the HttpWebResponse object's CharacterSet is iso-8859-1, which is usually just the server's default rather than the page's real encoding. In that case we should decode the page according to the charset it declares itself (a minimal sketch of that fallback is shown right below).
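Here is the core of that fallback in isolation. This is only a minimal sketch, not the full method: DecodeWithMetaCharset, pageBytes, and headerCharset are placeholder names I'm using for illustration, and it assumes the decompressed response body has already been buffered into a byte[] (the next point covers how to do that). It is meant to sit in the same helper class as the full code further down.

// Minimal sketch of the charset fallback. pageBytes is the decompressed body,
// headerCharset is HttpWebResponse.CharacterSet.
// Needs: using System.Text; using System.Text.RegularExpressions;
static string DecodeWithMetaCharset(byte[] pageBytes, string headerCharset)
{
    Encoding enc = Encoding.UTF8;
    try { if (!string.IsNullOrEmpty(headerCharset)) enc = Encoding.GetEncoding(headerCharset); }
    catch (ArgumentException) { }                    // unknown charset name: fall back to UTF-8
    string html = enc.GetString(pageBytes);

    // iso-8859-1 from the header is rarely the truth; prefer the <meta> declaration.
    if ("iso-8859-1".Equals(headerCharset, StringComparison.OrdinalIgnoreCase))
    {
        Match m = Regex.Match(html, @"<meta.*?charset=[""']?([\w-]+)", RegexOptions.IgnoreCase);
        if (m.Success)
        {
            try { html = Encoding.GetEncoding(m.Groups[1].Value).GetString(pageBytes); }
            catch (ArgumentException) { }            // bad meta value: keep the first decode
        }
    }
    return html;
}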
Another thing that trips people up is turning the response stream into a byte[] array: the stream returned by GetResponseStream() is not seekable, so you can't read its Length property and preallocate a buffer. The sketch below shows one way around that.
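A minimal sketch of that buffering step (ReadAllBytes is a placeholder name; on .NET 4 and later the read loop can be replaced with Stream.CopyTo, which is what the full code further down uses):

// Minimal sketch: copy a non-seekable stream (e.g. the one from GetResponseStream(),
// possibly wrapped in GZipStream/DeflateStream) into a MemoryStream chunk by chunk,
// since it exposes no Length to preallocate from.
// Needs: using System.IO;
static byte[] ReadAllBytes(Stream input)
{
    using (MemoryStream ms = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
        {
            ms.Write(chunk, 0, read);
        }
        return ms.ToArray();                         // now the length is known
    }
}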
Here is the full source:
using System;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

public static HttpWebResponse GetResponse(string url, string method, string data = null, string reffer = null, CookieContainer cookieContainer = null)
{
    //System.Net.ServicePointManager.Expect100Continue = true;
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

    // Note: some pages still won't download for all sorts of reasons (cookies required,
    // encoding quirks, etc.). Deal with those case by case, e.g. by adding a Cookie header:
    //   req.Headers.Add("Cookie", cookie);
    // and adding overloads of this method as needed.

    // Network credentials used to authenticate the request against the resource.
    req.Credentials = CredentialCache.DefaultCredentials;
    // If the server requires a username and password:
    //   req.Credentials = new NetworkCredential(user, password);

    //req.Proxy = null;            // works around the slow first request on .NET 4
    req.Proxy = GetIEProxy(url);   // helper (defined elsewhere) that returns the IE proxy settings
    req.KeepAlive = true;
    req.Method = method.ToUpper();
    req.AllowAutoRedirect = true;
    req.ContentType = "application/x-www-form-urlencoded";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    req.Headers.Add("Accept-Encoding", "gzip,deflate");
    //req.Connection = "keep-alive";
    req.Timeout = 10000;
    req.Referer = reffer;
    if (cookieContainer != null)
    {
        req.CookieContainer = cookieContainer;
    }
    req.UserAgent = IE9;           // IE9: a user-agent string constant defined elsewhere in the class

    if (method.ToUpper() == "POST" && data != null)
    {
        byte[] postBytes = Encoding.UTF8.GetBytes(data);
        req.ContentLength = postBytes.Length;
        Stream st = req.GetRequestStream();
        st.Write(postBytes, 0, postBytes.Length);
        st.Close();
    }

    // Accept any HTTPS certificate.
    System.Net.ServicePointManager.ServerCertificateValidationCallback += (se, cert, chain, sslerror) => { return true; };

    HttpWebResponse res = (HttpWebResponse)req.GetResponse();
    return res;
}

/**
 * getBytes(encoding) returns the string's bytes in that encoding.
 * When b[0] is 63 ('?'), the conversion has most likely failed.
 * A. A Chinese string that is NOT garbled:
 *    1. with GB2312, every byte is negative;
 *    2. with ISO8859_1, every b[i] is 63.
 * B. A garbled Chinese string:
 *    1. with ISO8859_1, every byte is negative;
 *    2. with GB2312, most b[i] are 63.
 * C. An English string:
 *    1. with either ISO8859_1 or GB2312, every byte is positive.
 * Summary: given a string, call getBytes("iso8859_1"):
 *    1. if b[i] contains 63, no re-decoding is needed;                          (A-2)
 *    2. if every b[i] is positive, it is plain English, no re-decoding;         (C-1)
 *    3. if any b[i] is negative, it is already garbled and must be re-decoded.  (B-1)
 */
public static String GetResponseString(string url, string method, string data = null, string reffer = null, CookieContainer cookieContainer = null)
{
    HttpWebResponse res = GetResponse(url, method, data, reffer, cookieContainer);
    Stream receiveStream = res.GetResponseStream();

    // Unwrap gzip/deflate according to the Content-Encoding header.
    if (res.ContentEncoding.ToLower().Contains("gzip"))
    {
        receiveStream = new GZipStream(receiveStream, CompressionMode.Decompress);
    }
    else if (res.ContentEncoding.ToLower().Contains("deflate"))
    {
        receiveStream = new DeflateStream(receiveStream, CompressionMode.Decompress);
    }

    // The response stream is not seekable and has no Length, so buffer it into a
    // byte array; that also lets us decode it a second time if the first charset
    // guess turns out to be wrong.
    byte[] buffer;
    using (MemoryStream ms = new MemoryStream())
    {
        receiveStream.CopyTo(ms);
        buffer = ms.ToArray();
    }

    // First pass: decode with the charset from the response headers (default UTF-8).
    Encoding encode = Encoding.UTF8;
    if (!string.IsNullOrEmpty(res.CharacterSet))
    {
        encode = Encoding.GetEncoding(res.CharacterSet);
    }
    string result = encode.GetString(buffer);

    // iso-8859-1 is usually just the server's default, so trust the <meta> charset instead.
    if (res.CharacterSet != null && res.CharacterSet.ToLower() == "iso-8859-1")
    {
        Match charSetMatch = Regex.Match(result, "<meta.*?charset=(.*?)[\"']", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        if (charSetMatch.Success)
        {
            string webCharSet = charSetMatch.Groups[1].Value.Replace("\"", "").Replace("'", "").Trim();
            Encoding webEncode = Encoding.GetEncoding(webCharSet);
            if (webEncode != encode)
            {
                // Second pass over the same bytes, with the charset the page itself declares.
                result = webEncode.GetString(buffer);
            }
        }
    }

    receiveStream.Close();
    res.Close();
    return result;
}
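A hedged usage example to finish: the class name HttpHelper, the URLs, and the form fields are placeholders I've made up, and it assumes the two methods above (plus the IE9 constant and the GetIEProxy helper) live together in that class.

// Hypothetical usage; HttpHelper, the URLs and the form fields are placeholders.
CookieContainer cookies = new CookieContainer();   // reuse the same container to keep the session

// Plain GET
string html = HttpHelper.GetResponseString("http://example.com/", "GET", null, null, cookies);

// Form POST (the body matches the application/x-www-form-urlencoded ContentType set above)
string loginPage = HttpHelper.GetResponseString("http://example.com/login", "POST",
    "user=test&pwd=123456", "http://example.com/", cookies);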