(golang)HTTP基本认证机制及使用gocolly登录爬取
内网有个网页用了HTTP基本认证机制,想用gocolly爬取,不知道怎么登录,只好研究HTTP基本认证机制。
参考这里:https://www.jb51.net/article/89070.htm
下面开始参考作者dotcoo了:-)
看了《HTTP权威指南》第12章HTTP基本认证机制(本站下载地址:https://www.jb51.net/books/93254.html),感觉讲的蛮详细的,写了一个小小例子测试。
请求响应过程:
==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345

<==
HTTP/1.1 401 Unauthorized
WWW-Authenticate: Basic realm="Dotcoo User Login"

==>
GET /hello HTTP/1.1
Host: 127.0.0.1:12345
Authorization: Basic YWRtaW46YWRtaW5wd2Q=

<==
HTTP/1.1 200 OK
Content-Type: text/plain; charset=utf-8

golang HTTP基本认证机制的实现代码
package main

import (
	"encoding/base64"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"
)

// basicChallenge is the WWW-Authenticate value sent with every 401 response.
const basicChallenge = `Basic realm="Dotcoo User Login"`

// challenge answers the request with 401 Unauthorized plus the Basic
// challenge header, so the client knows it has to (re)send credentials.
func challenge(w http.ResponseWriter) {
	w.Header().Set("WWW-Authenticate", basicChallenge)
	w.WriteHeader(http.StatusUnauthorized)
}

// HelloServer is a demo handler that walks through HTTP Basic authentication
// by hand. It parses the Authorization header, prints the decoded username
// and password to stdout and replies "hello, world!\n".
//
// The credentials are only decoded, never compared against anything, so any
// well-formed username:password pair is accepted. Every request that is
// missing the header, uses another scheme, or carries an undecodable payload
// is answered with 401 and a Basic challenge instead of an empty 200.
func HelloServer(w http.ResponseWriter, req *http.Request) {
	auth := req.Header.Get("Authorization")
	if auth == "" {
		challenge(w)
		return
	}
	fmt.Println(auth)

	// The header has the form "<scheme> <payload>".
	auths := strings.SplitN(auth, " ", 2)
	if len(auths) != 2 {
		fmt.Println("error")
		challenge(w)
		return
	}
	authMethod, authB64 := auths[0], auths[1]

	// Authentication scheme names are case-insensitive.
	if !strings.EqualFold(authMethod, "Basic") {
		fmt.Println("error")
		challenge(w)
		return
	}

	authstr, err := base64.StdEncoding.DecodeString(authB64)
	if err != nil {
		fmt.Println(err)
		challenge(w)
		io.WriteString(w, "Unauthorized!\n")
		return
	}
	fmt.Println(string(authstr))

	// The decoded payload is "username:password"; split on the first colon
	// only, so the password itself may contain colons.
	userPwd := strings.SplitN(string(authstr), ":", 2)
	if len(userPwd) != 2 {
		fmt.Println("error")
		challenge(w)
		return
	}
	username, password := userPwd[0], userPwd[1]
	fmt.Println("Username:", username)
	fmt.Println("Password:", password)
	fmt.Println()

	io.WriteString(w, "hello, world!\n")
}

func main() {
	http.HandleFunc("/hello", HelloServer)
	if err := http.ListenAndServe(":8000", nil); err != nil {
		log.Fatal("ListenAndServe: ", err)
	}
}

// Article text (translated): After trying the example above, the HTTP basic
// authentication flow is mostly clear. But how can the same page be accessed
// with gocolly?
参考:https://stackoverflow.com/questions/50576248/using-colly-framework-i-cant-login-to-the-evernote-account
但是答复者Matías Insaurralde提供的模拟浏览器访问的例子编译不通过,不明白其中的hptsKey的意思。代码放在下面供参考(可跳过):
package evernote

import (
	"bytes"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"regexp"
	"strings"
)

const (
	evernoteLoginURL = "https://www.evernote.com/Login.action"
)

var (
	evernoteJSParamsExpr = regexp.MustCompile(`document.getElementById\("(.*)"\).value = "(.*)"`)
	evernoteRedirectExpr = regexp.MustCompile(`Redirecting to <a href="(.*)">`)

	// hptsKey and hptshKey are the element ids looked for in the login page.
	// The listing referenced them without defining them, which is why it did
	// not compile. NOTE(review): the values assume the elements carry the
	// same names as the "hpts"/"hptsh" form fields posted by Login; confirm
	// against the live login page.
	hptsKey  = []byte("hpts")
	hptshKey = []byte("hptsh")

	errNoMatches   = errors.New("No matches")
	errRedirectURL = errors.New("Redirect URL not found")
)

// EvernoteClient wraps all methods required to interact with the website.
type EvernoteClient struct {
	Username   string
	Password   string
	httpClient *http.Client

	// These parameters persist during the login process:
	hpts  string
	hptsh string
}

// NewEvernoteClient initializes a new Evernote client whose HTTP client
// keeps cookies between requests.
func NewEvernoteClient(username, password string) *EvernoteClient {
	// Allocate a new cookie jar to mimic the browser behavior. The error is
	// deliberately ignored, as in the original listing.
	cookieJar, _ := cookiejar.New(nil)

	c := &EvernoteClient{
		Username: username,
		Password: password,
	}

	// Copy the default values from http.DefaultClient and attach the jar.
	c.httpClient = &http.Client{
		Transport:     http.DefaultTransport,
		CheckRedirect: http.DefaultClient.CheckRedirect,
		Jar:           cookieJar,
		Timeout:       http.DefaultClient.Timeout,
	}
	return c
}

// extractJSParams scans body for assignments of the form
// document.getElementById("<id>").value = "<value>" and stores the values
// assigned to the hpts and hptsh elements on the client. It returns
// errNoMatches when body contains no such assignment.
func (e *EvernoteClient) extractJSParams(body []byte) (err error) {
	matches := evernoteJSParamsExpr.FindAllSubmatch(body, -1)
	if len(matches) == 0 {
		return errNoMatches
	}
	for _, submatches := range matches {
		if len(submatches) < 3 {
			// The original set err and then returned nil, dropping it.
			return errNoMatches
		}
		key, val := submatches[1], submatches[2]
		if bytes.Equal(key, hptsKey) {
			e.hpts = string(val)
		}
		if bytes.Equal(key, hptshKey) {
			e.hptsh = string(val)
		}
	}
	return nil
}

// readBody reads the whole response body and closes it.
func readBody(res *http.Response) ([]byte, error) {
	defer res.Body.Close()
	return ioutil.ReadAll(res.Body)
}

// postLoginForm posts values to the login URL with the given Accept header
// and returns the response body.
func (e *EvernoteClient) postLoginForm(values url.Values, accept string) ([]byte, error) {
	req, err := http.NewRequest(http.MethodPost, evernoteLoginURL, strings.NewReader(values.Encode()))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Accept", accept)
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
	req.Header.Set("x-requested-with", "XMLHttpRequest")
	req.Header.Set("referer", evernoteLoginURL)

	res, err := e.httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	return readBody(res)
}

// Login handles the login action. It performs three requests with the same
// HTTP client: fetch the login page, submit the username, then submit the
// password. On success it prints the redirect URL found in the last response.
func (e *EvernoteClient) Login() error {
	// First step: fetch the login page as a browser visitor would do.
	res, err := e.httpClient.Get(evernoteLoginURL)
	if err != nil {
		return err
	}
	if res.Body == nil {
		return errors.New("No response body")
	}
	body, err := readBody(res)
	if err != nil {
		return err
	}
	if err = e.extractJSParams(body); err != nil {
		return err
	}

	// Second step: with "hpts" and "hptsh" extracted, send a request using
	// only the username and setting "evaluateUsername".
	values := url.Values{}
	values.Set("username", e.Username)
	values.Set("evaluateUsername", "")
	values.Set("analyticsLoginOrigin", "login_action")
	values.Set("clipperFlow", "false")
	values.Set("showSwitchService", "true")
	values.Set("hpts", e.hpts)
	values.Set("hptsh", e.hptsh)

	body, err = e.postLoginForm(values, "application/json")
	if err != nil {
		return err
	}
	if !strings.Contains(string(body), `"usePasswordAuth":true`) {
		return errors.New("Password auth not enabled")
	}

	// Third step: do the final request, appending the password to the form.
	values.Del("evaluateUsername")
	values.Set("password", e.Password)
	values.Set("login", "Sign in")

	body, err = e.postLoginForm(values, "text/html")
	if err != nil {
		return err
	}

	// Check the body in order to find the redirect URL.
	matches := evernoteRedirectExpr.FindAllStringSubmatch(string(body), -1)
	if len(matches) == 0 {
		return errRedirectURL
	}
	m := matches[0]
	if len(m) < 2 {
		return errRedirectURL
	}
	redirectURL := m[1]
	fmt.Println("Login is ok, redirect URL:", redirectURL)
	return nil
}

// After you successfully get the redirect URL, you should be able to send
// authenticated requests as long as you keep using the HTTP client that was
// used for the login process; the cookie jar plays a very important role here.
//
// To call this code use (from a separate main package):
//
//	func main() {
//		evernoteClient := NewEvernoteClient("user@company", "password")
//		err := evernoteClient.Login()
//		if err != nil {
//			panic(err)
//		}
//	}
//
// Article text (translated): So I had to write it myself. After repeated
// experiments I found that, for the server written at the start of this
// article, the following code is all that is needed to pass authentication;
// it prints hello,world! (Changing the request method to POST works the same.)
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"time"
)

// Login sends one request with HTTP Basic credentials to the demo server at
// http://localhost:8000/hello and prints the response headers, the
// Www-Authenticate header and the body. Any failure is printed and ends the
// function instead of crashing on a nil response.
func Login() {
	// A bounded timeout keeps the demo from hanging on an unresponsive server.
	client := &http.Client{Timeout: 10 * time.Second}

	// URL to visit.
	url := "http://localhost:8000/hello"

	// Request to submit.
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Println("create request:", err)
		return
	}

	// The key line; per the article, the username and password can be anything.
	req.SetBasicAuth("aa", "bb")

	// Print the method actually used (the original always printed "POST").
	fmt.Println(req.Method + "訪問")

	// Response.
	res, err := client.Do(req)
	if err != nil {
		fmt.Println("send request:", err)
		return
	}
	defer res.Body.Close()

	fmt.Println("header:")
	fmt.Println(res.Header)

	fmt.Println("realm:")
	fmt.Println(res.Header.Get("Www-Authenticate"))

	fmt.Println("body:")
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		fmt.Println("read body:", err)
		return
	}
	fmt.Println(string(body))
}

func main() {
	Login()
}

// Article text (translated): Looking up the definition of SetBasicAuth
// (in LiteIDE, press Ctrl+Shift+J with the cursor on it) gives:
func (r *Request) SetBasicAuth(username, password string) {r.Header.Set("Authorization", "Basic "+basicAuth(username, password)) }而basicAuth的定義為
// basicAuth joins username and password with a colon and returns the
// standard base64 encoding of the result.
func basicAuth(username, password string) string {
	credentials := []byte(username + ":" + password)
	return base64.StdEncoding.EncodeToString(credentials)
}

// Article text (translated): So the code for accessing the page with gocolly
// is as follows:
package main

import (
	"encoding/base64"
	"fmt"
	"log"
	"net/http"

	"github.com/gocolly/colly"
)

// basicAuth joins username and password with a colon and returns the
// standard base64 encoding of the result.
func basicAuth(username, password string) string {
	auth := username + ":" + password
	return base64.StdEncoding.EncodeToString([]byte(auth))
}

// main fetches http://localhost:8000/hello through a colly collector, passing
// a hand-built Basic Authorization header, and prints the response body.
func main() {
	c := colly.NewCollector()

	h := http.Header{}
	h.Set("Authorization", "Basic "+basicAuth("aaaa", "bbbb"))

	c.OnResponse(func(r *colly.Response) {
		//fmt.Println(r)
		fmt.Println(string(r.Body))
	})

	// Report a failed request instead of exiting silently with no output.
	if err := c.Request("GET", "http://localhost:8000/hello", nil, nil, h); err != nil {
		log.Fatal(err)
	}
}

// Article text (translated): Note: for other websites you may need to capture
// the traffic with Fiddler and set the corresponding headers and cookies.
转载于:https://www.cnblogs.com/pu369/p/10408898.html
总结
以上是生活随笔为你收集整理的(golang)HTTP基本认证机制及使用gocolly登录爬取的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: Tensorflow源码解析1 -- 内
- 下一篇: catkin_make与gtest出现冲