不廢話,上程式
package jsoup.stock;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
public class GetServerResource {
public static void main(String[] args) {
String surl = "https://tw.stock.yahoo.com/d/s/company_2330.html";
try {
URL url = new URL(surl);
HttpURLConnection con =
(HttpURLConnection)url.openConnection();
con.setDoInput(true);
con.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
con.setRequestProperty("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36");
// con.setRequestProperty("Accept-Encoding", "gzip, deflate, br");
// con.setRequestProperty("Accept-Language", "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6,ko;q=0.5,vi;q=0.4");
con.setRequestMethod("GET");
InputStream is = con.getInputStream();
BufferedReader in = new BufferedReader
(new InputStreamReader(is));
PrintWriter pw = new PrintWriter(
new OutputStreamWriter(
new FileOutputStream
("d:\\yahooindex.html"),"BIG5")
);
String decodedString;
while((decodedString = in.readLine())!=null) {
pw.println(decodedString);
}
in.close();
pw.close();
System.out.println("網頁下載成功");
} catch(Exception e) {
e.printStackTrace();
}
}
}
這段程式,是用請求標頭去抓一個網頁
基本上就是F12打開把Request寫進JAVA程式,然後讓他用File.io產出你要的東西
可以是圖片、網頁、文字等等...
我目前用這方法抓圖片沒有問題
不過現在我遇到問題了,我想要抓股票的網頁
但是總是會顯示亂碼...
這是正常該顯示的網頁
但這是我顯示出的網頁..
基本上我BIG5跟UTF-8都試過了,都會這樣出現
該怎麼樣可以處理這個編碼問題呢...我該多寫什麼?
在InputStreamReader和OutputStreamWriter都指定為UTF-8的情況下:
BufferedReader in = new BufferedReader(new InputStreamReader(is,"UTF-8"));
PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("yahooindex.html"),"UTF-8"));
多執行幾次, Yahoo!偶而會傳回正確的編碼, 大部份會傳回亂碼. 當Yahoo!傳回亂碼時, 在<!doctype html public "-//w3c//dtd html 4.01//en" "http://www.w3.org/tr/html4/strict.dtd">
前會有一大塊<style>
和<script>
的東西, 如下:
<style type='text/css'>
.ad_slug_table TD { font-size:0px;cursor:default; }
.ad_slug_table TD SPAN { font-size:0px;cursor:default; }
.ad_slug_table TD SPAN FONT { font-size:10px;cursor:default; }
.ad_slug_table TD SPAN.can_ad_slug { font-size:11px;text-align:right }
</style>
<script type='text/javascript' src='https://s.yimg.com/rq/darla/4-8-0/js/g-r-min.js'></script>
<script type="text/javascript">
if (!window.DARLA_CONFIG) {
DARLA_CONFIG = {"useYAC":0,"usePE":0,"servicePath":"https:\/\/tw.stock.yahoo.com\/sdarla\/php\/fc.php","xservicePath":"","beaconPath":"https:\/\/tw.stock.yahoo.com\/sdarla\/php\/b.php","renderPath":"","allowFiF":false,"srenderPath":"https:\/\/s.yimg.com\/rq\/darla\/4-8-0\/html\/r-sf.html","renderFile":"https:\/\/s.yimg.com\/rq\/darla\/4-8-0\/html\/r-sf.html","sfbrenderPath":"https:\/\/s.yimg.com\/rq\/darla\/4-8-0\/html\/r-sf.html","msgPath":"https:\/\/fc.yahoo.com\/unsupported-1946.html","cscPath":"https:\/\/s.yimg.com\/rq\/darla\/4-8-0\/html\/r-csc.html","root":"sdarla","edgeRoot":"https:\/\/s.yimg.com\/rq\/darla\/4-8-0","sedgeRoot":"https:\/\/s.yimg.com\/rq\/darla\/4-8-0","version":"4-8-0","tpbURI":"","hostFile":"https:\/\/s.yimg.com\/rq\/darla\/4-8-0\/js\/g-r-min.js","beaconsDisabled":true,"rotationTimingDisabled":true,"fdb_locale":"\u60a8\u70ba\u4ec0\u9ebc\u4e0d\u559c\u6b61\u9019\u5247\u5ee3\u544a\uff1f|\u5ee3\u544a\u5167\u5bb9\u4ee4\u4eba\u53cd\u611f|\u5176\u4ed6|\u611f\u8b1d\u60a8\u5354\u52a9\u6211\u5011\u6539\u5584\u60a8\u7684 Yahoo\u5947\u6469\u9ad4\u9a57|\u5ee3\u544a\u5167\u5bb9\u8207\u6211\u6c92\u4ec0\u9ebc\u95dc\u806f|\u4f7f\u4eba\u5206\u5fc3|\u6211\u4e0d\u559c\u6b61\u9019\u652f\u5ee3\u544a|\u9001\u51fa|\u5b8c\u6210|\u70ba\u4ec0\u9ebc\u6211\u6703\u770b\u898b\u5ee3\u544a\uff1f|\u9032\u4e00\u6b65\u4e86\u89e3\u60a8\u7684\u610f\u898b\u3002|\u60f3\u8981\u4f7f\u7528\u7121\u5ee3\u544a\u6536\u4ef6\u5323\u55ce\uff1f\u5feb\u4f86\u5347\u7d1a Yahoo Mail Pro\uff01|\u7acb\u5373\u5347\u7d1a","lang":"zh-Hant-TW"};
}
DARLA_CONFIG.events = {
'renderN':
{
name: 'renderN',
ps: 'N',
sp: 152952227,
npv: 1,
ssl: 1
}, 'renderHEADR':
{
name: 'renderHEADR',
ps: 'HEADR',
sp: 152952227,
npv: 1,
ssl: 1
}, 'renderFOOT':
{
name: 'renderFOOT',
ps: 'FOOT',
sp: 152952227,
npv: 1,
ssl: 1
},
'AUTO':
{
name: "AUTO",
sp: 152952227,
autoStart: 1,
autoIV:1,
ps: "N,HEADR,FOOT",
ssl: 1,
secure: 1,
ref: document.location.href // to support testing ads.
}
};
DARLA_CONFIG.positions = {
N: {wcpx: 1, hcpx: 1, dest: 'tgtN', clean: 'boxN' ,fdb: { "on": 1 }, metaSize: true},
HEADR: {wcpx: 1, hcpx: 1, dest: 'tgtHEADR', clean: 'boxHEADR' ,fdb: { "on": 1 }, metaSize: true},
FOOT: {wcpx: 1, hcpx: 1, dest: 'tgtFOOT', clean: 'boxFOOT' ,fdb: { "on": 1 }, metaSize: true}
};
</script><!doctype html public "-//w3c//dtd html 4.01//en" "http://www.w3.org/tr/html4/strict.dtd">
Yahoo!的網站很古老, 也許是因為java HttpURLConnection對Yahoo!的處理出了問題? 也許是Yahoo!的問題? 因為String surl = "https://ithelp.ithome.com.tw/", 不會有問題.
用原版的程式(只改存檔路徑及檔名)結果如下
改一列之後結果如下
改的是以下這列
PrintWriter pw = new PrintWriter(
new OutputStreamWriter(
new FileOutputStream
("yahooindex.html"),"UTF-8")