I have looked at a lot of code for fetching a page's HTML source, and most of it either produces mojibake or ignores gzip/deflate compression. For example, some responses report iso-8859-1 as the HttpWebResponse object's CharacterSet; in that case the code below falls back to the charset declared in the page's own &lt;meta&gt; tag instead of trusting the header.
There is also the matter of turning the response stream into a byte[] array: the stream returned by GetResponseStream() is not seekable, so its Length property cannot be read.
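The usual workaround is to copy the stream into a MemoryStream and take its array. A minimal sketch (ReadAllBytes is just an illustrative helper name, not part of the code below):

// Minimal sketch: buffer a non-seekable stream into a byte[] via MemoryStream.
// Stream.CopyTo is available from .NET 4 onwards.
public static byte[] ReadAllBytes(Stream input)
{
    using (MemoryStream ms = new MemoryStream())
    {
        input.CopyTo(ms);
        return ms.ToArray();
    }
}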
The full source code follows.
// Required namespaces: System, System.IO, System.IO.Compression, System.Net,
// System.Text, System.Text.RegularExpressions.
public static HttpWebResponse GetResponse(string url, string method, string data = null, string reffer = null, CookieContainer cookieContainer = null)
{
    //System.Net.ServicePointManager.Expect100Continue = true;
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    // Note: some pages may still fail to download for various reasons (missing
    // cookies, encoding issues, and so on). Handle those case by case, e.g. add
    // a Cookie header:
    // req.Headers.Add("Cookie", cookie);
    // and add overloads of this method as needed.

    // Network credentials used to authenticate the request against the resource.
    req.Credentials = CredentialCache.DefaultCredentials;
    // If the server requires a user name and password:
    //NetworkCredential mycred = new NetworkCredential(struser, strpassword);
    //req.Credentials = mycred;

    //req.Proxy = null; // works around the slow first request on .NET 4
    req.Proxy = GetIEProxy(url);
    req.KeepAlive = true;
    req.Method = method.ToUpper();
    req.AllowAutoRedirect = true;
    req.ContentType = "application/x-www-form-urlencoded";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    req.Headers.Add("Accept-Encoding", "gzip,deflate");
    req.Timeout = 10000;
    req.Referer = reffer;
    if (cookieContainer != null)
    {
        req.CookieContainer = cookieContainer;
    }
    req.UserAgent = IE9;
    if (method.ToUpper() == "POST" && data != null)
    {
        byte[] postBytes = Encoding.UTF8.GetBytes(data);
        req.ContentLength = postBytes.Length;
        Stream st = req.GetRequestStream();
        st.Write(postBytes, 0, postBytes.Length);
        st.Close();
    }
    // Accept any server certificate (e.g. self-signed HTTPS sites).
    ServicePointManager.ServerCertificateValidationCallback += (se, cert, chain, sslerror) =>
    {
        return true;
    };
    HttpWebResponse res = (HttpWebResponse)req.GetResponse();
    return res;
}
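As a side note, and only as an alternative the code above does not use: HttpWebRequest can decompress gzip/deflate responses by itself via AutomaticDecompression, which would make the manual GZipStream/DeflateStream handling in GetResponseString unnecessary:

// Alternative sketch (not used above): let HttpWebRequest decompress the response.
// req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;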
/**
 * Encoding-detection notes, in terms of getBytes(encoding) returning a byte array
 * (signed bytes, as in Java). A byte value of 63 ('?') means a character could not
 * be encoded.
 * A. A Chinese string that is NOT mojibake:
 *    1. with GB2312, every byte is negative;
 *    2. with ISO8859-1, every b[i] is 63.
 * B. A Chinese string that IS mojibake:
 *    1. with ISO8859-1, every byte is negative;
 *    2. with GB2312, most b[i] are 63.
 * C. An English string:
 *    1. with ISO8859-1 or GB2312, every byte is greater than 0.
 * Summary: given a string, take getBytes("iso8859-1"):
 *    1. if some b[i] is 63, no re-decoding is needed (case A-2);
 *    2. if every b[i] is greater than 0, it is a plain English string, no re-decoding (case C-1);
 *    3. if some b[i] is negative, the string is already mojibake and must be re-decoded (case B-1).
 */
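The notes above are phrased in terms of Java's signed bytes; in C# a byte is unsigned, so Java's "negative" bytes correspond to values above 127. A rough C# sketch of the same heuristic (NeedsReDecode is a made-up name, and like the notes it is only a rule of thumb):

// Rough heuristic from the notes above: re-encode with ISO-8859-1 and inspect the bytes.
public static bool NeedsReDecode(string s)
{
    byte[] b = Encoding.GetEncoding("ISO-8859-1").GetBytes(s);
    // Case A-2: a '?' (63) means some character could not be mapped to ISO-8859-1,
    // i.e. the string already holds real (e.g. Chinese) characters - no re-decode.
    if (Array.IndexOf(b, (byte)63) >= 0) return false;
    // Case B-1: bytes above 127 (Java's "negative" bytes) mean the text is still
    // raw single-byte data read with the wrong charset - it needs re-decoding.
    if (Array.Exists(b, x => x > 127)) return true;
    // Case C-1: plain ASCII/English - nothing to do.
    return false;
}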
public static String GetResponseString(string url, string method, string data = null, string reffer = null, CookieContainer cookieContainer = null)
{
    HttpWebResponse res = GetResponse(url, method, data, reffer, cookieContainer);
    Stream receiveStream = res.GetResponseStream();
    // Wrap the response stream in a decompression stream if the server compressed it.
    if (res.ContentEncoding.ToLower().Contains("gzip"))
    {
        receiveStream = new GZipStream(receiveStream, CompressionMode.Decompress);
    }
    else if (res.ContentEncoding.ToLower().Contains("deflate"))
    {
        receiveStream = new DeflateStream(receiveStream, CompressionMode.Decompress);
    }
    // The response stream is not seekable and has no Length, so buffer it into a
    // byte[] first; that also lets us decode it again with a different encoding.
    byte[] buffer;
    using (MemoryStream ms = new MemoryStream())
    {
        receiveStream.CopyTo(ms);
        buffer = ms.ToArray();
    }
    receiveStream.Close();

    // Default to UTF-8, then prefer the charset reported in the response headers.
    Encoding encode = Encoding.UTF8;
    if (!string.IsNullOrEmpty(res.CharacterSet))
    {
        encode = Encoding.GetEncoding(res.CharacterSet);
    }
    string result = encode.GetString(buffer);

    // iso-8859-1 is often just the server's default when it does not know the real
    // charset, so fall back to the charset declared in the page's <meta> tag.
    if (res.CharacterSet != null && res.CharacterSet.ToLower() == "iso-8859-1")
    {
        Match charSetMatch = Regex.Match(result, "<meta.*?charset=(.*?)[\"|']", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        if (charSetMatch.Success)
        {
            string webCharSet = charSetMatch.Groups[1].Value.Replace("\"", "").Replace("'", "").Trim();
            if (Encoding.GetEncoding(webCharSet) != encode)
            {
                result = Encoding.GetEncoding(webCharSet).GetString(buffer);
            }
        }
    }
    res.Close();
    return result;
}
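A usage sketch, assuming the IE9 user-agent string and the GetIEProxy helper referenced above are defined elsewhere in the same class (the URLs and form fields are placeholders):

// Plain GET.
string html = GetResponseString("http://example.com/", "GET");
// Form POST with a referer and a cookie container that can be reused across calls.
CookieContainer cookies = new CookieContainer();
string posted = GetResponseString("http://example.com/login", "POST",
    "user=foo&pwd=bar", "http://example.com/", cookies);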