之前看了很多获取网页源码的写法,要么有乱码,要么没考虑到 gzip 等压缩。比如有些网页的 HttpWebResponse 对象的 CharacterSet 是 iso-8859-1(响应头里没有声明 charset 时通常就是这个值),这种情况下我们改为根据网页 <meta> 标签里声明的 charset 来读取。
另外还有一种做法是先把流转成 byte[] 数组再解码,但要注意 GetResponseStream() 返回的流不支持读取 Length 属性,只能一直读到流结束。
以下是源码
// Guards one-time registration of the certificate callback below.
private static readonly object CertificateCallbackGate = new object();
private static bool certificateCallbackRegistered;

// Matches both <meta charset="utf-8"> and
// <meta http-equiv="Content-Type" content="text/html; charset=gb2312">.
private static readonly Regex MetaCharsetRegex = new Regex(
    "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Installs, exactly once per process, a certificate validation callback that accepts
/// every server certificate.
/// </summary>
/// <remarks>
/// SECURITY: this disables TLS certificate validation for the whole process, not just for
/// requests made here. It is kept because the original code behaved this way; remove it
/// (or restrict it to known hosts) if callers do not rely on self-signed certificates.
/// </remarks>
private static void EnsureCertificateCallbackRegistered()
{
    lock (CertificateCallbackGate)
    {
        if (certificateCallbackRegistered)
        {
            return;
        }

        System.Net.ServicePointManager.ServerCertificateValidationCallback +=
            (se, cert, chain, sslerror) => true;
        certificateCallbackRegistered = true;
    }
}

/// <summary>
/// Sends an HTTP request and returns the raw response. The caller owns the returned
/// response and must close it.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <param name="method">HTTP verb, case-insensitive (for example "GET" or "POST").</param>
/// <param name="data">Form-encoded body; only sent when the verb is POST and this is not null.</param>
/// <param name="reffer">Value for the Referer header, or null for none.</param>
/// <param name="cookieContainer">Cookie container to attach to the request, or null for none.</param>
/// <returns>The response; its body may still be gzip/deflate compressed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="method"/> is null.</exception>
public static HttpWebResponse GetResponse(string url, string method, string data = null, string reffer = null, CookieContainer cookieContainer = null)
{
    if (method == null)
    {
        throw new ArgumentNullException("method");
    }

    // Invariant casing: HTTP verbs are protocol tokens, not culture-sensitive text.
    string verb = method.ToUpperInvariant();

    // Must happen before any connection is opened; GetRequestStream() below already
    // performs the TLS handshake for POST requests.
    EnsureCertificateCallbackRegistered();

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

    // Note: some pages cannot be downloaded for various reasons (cookies required,
    // encoding problems, ...). Handle those case by case, e.g. by adding a Cookie
    // header (req.Headers.Add("Cookie", cookie)) through an additional overload.

    // Credentials used to authenticate the request against the Internet resource.
    // If the server requires a user name and password, assign a NetworkCredential instead.
    req.Credentials = CredentialCache.DefaultCredentials;

    // Setting req.Proxy = null is a known workaround for the slow first request on .NET 4;
    // here the proxy is supplied by GetIEProxy instead.
    req.Proxy = GetIEProxy(url);
    req.KeepAlive = true;
    req.Method = verb;
    req.AllowAutoRedirect = true;
    req.ContentType = "application/x-www-form-urlencoded";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    req.Headers.Add("Accept-Encoding", "gzip,deflate");
    req.Timeout = 10000;
    req.Referer = reffer;
    if (cookieContainer != null)
    {
        req.CookieContainer = cookieContainer;
    }

    req.UserAgent = IE9;

    if (verb == "POST" && data != null)
    {
        byte[] postBytes = Encoding.UTF8.GetBytes(data);
        req.ContentLength = postBytes.Length;
        using (Stream requestStream = req.GetRequestStream())
        {
            requestStream.Write(postBytes, 0, postBytes.Length);
        }
    }

    return (HttpWebResponse)req.GetResponse();
}

// Resolves a charset name to an Encoding; returns null when the name is missing or unknown.
private static Encoding TryGetEncoding(string charset)
{
    if (charset == null)
    {
        return null;
    }

    string name = charset.Trim().Trim('"', '\'').Trim();
    if (name.Length == 0)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        // Unknown or malformed charset name: let the caller fall back.
        return null;
    }
}

// Reads the whole response body into memory, undoing gzip/deflate content encoding.
private static byte[] ReadResponseBytes(HttpWebResponse res)
{
    string contentEncoding = res.ContentEncoding ?? string.Empty;

    using (Stream raw = res.GetResponseStream())
    {
        Stream body = raw;
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            body = new GZipStream(raw, CompressionMode.Decompress);
        }
        else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            body = new DeflateStream(raw, CompressionMode.Decompress);
        }

        // The response stream does not expose Length, so copy until end of stream.
        using (body)
        using (MemoryStream buffer = new MemoryStream())
        {
            body.CopyTo(buffer);
            return buffer.ToArray();
        }
    }
}

// Decodes buffered bytes through a StreamReader so byte-order marks are honoured and stripped.
private static string DecodeBody(byte[] body, Encoding encoding)
{
    using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
    {
        return reader.ReadToEnd();
    }
}

// Background note carried over from the original author. It apparently describes signed
// bytes as returned by a Java-style getBytes(encoding) call:
//   When b[0] is 63 ('?'), the conversion most likely failed.
//   A. Chinese string that is NOT garbled:
//      1. with GB2312 every byte is negative;
//      2. with ISO8859_1 every b[i] is 63.
//   B. Chinese string that IS garbled:
//      1. with ISO8859_1 every byte is negative as well;
//      2. with GB2312 most b[i] are 63.
//   C. English string:
//      1. with ISO8859_1 and GB2312 every byte is greater than 0.
//   Summary - given a string, call getBytes("iso8859_1"):
//      1. if some b[i] is 63, no re-decoding is needed (case A-2);
//      2. if every b[i] is greater than 0, it is English and needs no re-decoding (case C-1);
//      3. if some b[i] is negative, the text is garbled and must be re-decoded (case B-1).

/// <summary>
/// Downloads a page and returns its body as text, handling gzip/deflate compression and
/// charset detection.
/// </summary>
/// <remarks>
/// The body is decoded with the charset reported by the response (UTF-8 when that is
/// missing or unknown). When the reported charset is iso-8859-1, the decoded text is
/// searched for a &lt;meta ... charset=...&gt; declaration and, if that names a different
/// encoding, the buffered bytes are decoded again with it.
/// </remarks>
/// <param name="url">Absolute URL to request.</param>
/// <param name="method">HTTP verb, case-insensitive.</param>
/// <param name="data">Form-encoded body for POST requests, or null.</param>
/// <param name="reffer">Value for the Referer header, or null for none.</param>
/// <param name="cookieContainer">Cookie container to attach to the request, or null for none.</param>
/// <returns>The decoded response body.</returns>
public static String GetResponseString(string url, string method, string data = null, string reffer = null, CookieContainer cookieContainer = null)
{
    byte[] body;
    string declaredCharset;

    // Buffer the bytes once: a network stream cannot be rewound, so any second decoding
    // pass has to work from memory rather than from the response stream.
    using (HttpWebResponse res = GetResponse(url, method, data, reffer, cookieContainer))
    {
        declaredCharset = res.CharacterSet;
        body = ReadResponseBytes(res);
    }

    Encoding encode = TryGetEncoding(declaredCharset) ?? Encoding.UTF8;
    string result = DecodeBody(body, encode);

    bool reportedLatin1 = declaredCharset != null
        && string.Equals(declaredCharset.Trim().Trim('"', '\''), "iso-8859-1", StringComparison.OrdinalIgnoreCase);
    if (!reportedLatin1)
    {
        return result;
    }

    // iso-8859-1 is frequently just a default rather than the page's real encoding,
    // so trust the charset the document declares about itself when there is one.
    Match charSetMatch = MetaCharsetRegex.Match(result);
    if (charSetMatch.Success)
    {
        Encoding pageEncoding = TryGetEncoding(charSetMatch.Groups[1].Value);
        if (pageEncoding != null && pageEncoding.CodePage != encode.CodePage)
        {
            result = DecodeBody(body, pageEncoding);
        }
    }

    return result;
}