| 注册
请输入搜索内容

热门搜索

Java Linux MySQL PHP JavaScript Hibernate jQuery Nginx
fefet
10年前发布

Golang网页下载示例

// Command main mirrors an HTTP directory listing into a local directory.
//
// Starting at the -host URL it fetches each listing page, treats every <a>
// whose text ends in "/" as a sub-directory (crawled in its own goroutine)
// and every other <a> as a file to download. Link texts are decoded from
// GB18030 so that Chinese file names end up as valid UTF-8 paths on disk.
package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"

	query "github.com/PuerkitoBio/goquery"
	"golang.org/x/text/encoding/simplifiedchinese"
)

// Kept from the original program: pins GOMAXPROCS to the CPU count at package
// initialisation. Recent Go runtimes already default to this value.
var (
	np = runtime.NumCPU()
	_  = runtime.GOMAXPROCS(np)
)

// wg tracks every crawlFolder goroutine so that main can wait for all of them.
var wg sync.WaitGroup

// Folder is a remote directory listing together with the local directory it
// is mirrored into.
type Folder struct {
	Url string
	Dir string
}

// File is a remote file together with the local directory and file name it
// is saved under.
type File struct {
	Url  string
	Dir  string
	Name string
}

// checkErr prints err and terminates the process with exit status 1 when err
// is non-nil.
func checkErr(err error) {
	if err != nil {
		fmt.Printf("%v\n", err.Error())
		os.Exit(1)
	}
}

// decodeToGBK converts text from GB18030 bytes into a UTF-8 string (despite
// its name it decodes, it does not encode). On failure the input is returned
// unchanged together with the error.
func decodeToGBK(text string) (string, error) {
	// Decoder.String sizes the output buffer itself; a fixed len(text)*2
	// buffer is too small when invalid bytes expand to 3-byte U+FFFD.
	decoded, err := simplifiedchinese.GB18030.NewDecoder().String(text)
	if err != nil {
		return text, err
	}
	return decoded, nil
}

// printEach is a debugging callback for Selection.Each that prints the text
// of every selected node.
func printEach(index int, item *query.Selection) {
	fmt.Println("Selection: ", item.Text())
}

// isDir reports whether path denotes a directory entry, i.e. ends in "/".
func isDir(path string) bool {
	return strings.HasSuffix(path, "/")
}

// isLocalName reports whether a decoded link text can safely be used as a
// single path component below the download directory. It rejects empty
// names, "." and ".." (the parent links that many listings contain, which
// would otherwise make the crawl recurse upwards forever) and any name that
// still contains a path separator after its trailing "/" is removed.
func isLocalName(name string) bool {
	name = strings.TrimSuffix(name, "/")
	switch name {
	case "", ".", "..":
		return false
	}
	return !strings.ContainsAny(name, `/\`)
}

// makeFolder builds the Folder described by the anchor item found on the
// listing page at url, to be mirrored below dir. It fails when the anchor has
// no href, when its text cannot be decoded, or when the decoded name is not a
// plain sub-directory name.
func makeFolder(item *query.Selection, url, dir string) (*Folder, error) {
	tx := item.Text()
	href, ok := item.Attr("href")
	if !ok {
		return nil, errors.New("makeFolder : " + tx + " href属性不存在")
	}
	name, err := decodeToGBK(tx)
	if err != nil {
		return nil, err
	}
	if !isLocalName(name) {
		return nil, fmt.Errorf("makeFolder : skipping entry with unsafe name %q", name)
	}
	return &Folder{Url: url + href, Dir: filepath.Join(dir, name)}, nil
}

// makeFile builds the File described by the anchor item found on the listing
// page at url, to be saved inside dir. It fails when the anchor has no href,
// when its text cannot be decoded, or when the decoded name is not a plain
// file name.
func makeFile(item *query.Selection, url, dir string) (*File, error) {
	tx := item.Text()
	href, ok := item.Attr("href")
	if !ok {
		return nil, errors.New("makeFile : " + tx + " href属性不存在")
	}
	name, err := decodeToGBK(tx)
	if err != nil {
		return nil, err
	}
	if !isLocalName(name) {
		return nil, fmt.Errorf("makeFile : skipping entry with unsafe name %q", name)
	}
	return &File{Url: url + href, Dir: dir, Name: name}, nil
}

// fetch issues a GET request for rawURL and returns the response only when
// the server answered 200 OK, so that error pages are neither parsed as
// listings nor saved as files. The caller must close the response body.
func fetch(rawURL string) (*http.Response, error) {
	resp, err := http.Get(rawURL)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("GET %s: unexpected status %s", rawURL, resp.Status)
	}
	return resp, nil
}

// crawl fetches the listing page at url and processes every anchor on it:
// directory entries are crawled concurrently via crawlFolder, file entries
// are downloaded synchronously into localDir. Errors are printed and the
// offending page or entry is skipped.
func crawl(url, localDir string) {
	resp, err := fetch(url)
	if err != nil {
		fmt.Printf("%v\n", err.Error())
		return
	}
	doc, err := query.NewDocumentFromReader(resp.Body)
	resp.Body.Close()
	if err != nil {
		fmt.Printf("%v\n", err.Error())
		return
	}

	// Hrefs are appended to url verbatim, so it must end in a slash.
	if !strings.HasSuffix(url, "/") {
		url += "/"
	}

	doc.Find("a").Each(func(_ int, item *query.Selection) {
		if isDir(item.Text()) {
			folder, err := makeFolder(item, url, localDir)
			if err != nil {
				fmt.Printf("%v\n", err.Error())
				return
			}
			// Add before the goroutine starts so wg.Wait cannot
			// observe a zero counter while work is still pending.
			wg.Add(1)
			go crawlFolder(folder)
			return
		}

		file, err := makeFile(item, url, localDir)
		if err != nil {
			fmt.Printf("%v\n", err.Error())
			return
		}
		download(file)
	})
}

// download saves file.Url as file.Name inside file.Dir, creating the
// directory when needed. The body is streamed to a temporary ".part" file
// that is renamed into place only after a complete transfer, so a failed
// download leaves neither a truncated file nor a damaged earlier copy.
// Errors are printed; nothing is returned.
func download(file *File) {
	if err := os.MkdirAll(file.Dir, os.ModePerm); err != nil {
		fmt.Printf("%v\n", err.Error())
		return
	}
	// Best effort, as in the original: MkdirAll is subject to the umask, and
	// a failure to widen the permissions does not prevent the download.
	_ = os.Chmod(file.Dir, os.ModePerm)

	resp, err := fetch(file.Url)
	if err != nil {
		fmt.Printf("%v\n", err.Error())
		return
	}
	defer resp.Body.Close()

	fp := filepath.Join(file.Dir, file.Name)
	tmp := fp + ".part"

	out, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0777)
	if err != nil {
		fmt.Printf("%v fp:[%v]\n", err.Error(), fp)
		return
	}

	_, err = io.Copy(out, resp.Body)
	// A failed Close can mean lost data, so it counts as a write error too.
	if cerr := out.Close(); err == nil {
		err = cerr
	}
	if err == nil {
		err = os.Rename(tmp, fp)
	}
	if err != nil {
		os.Remove(tmp)
		fmt.Printf("%v fp:[%v]\n", err.Error(), fp)
		return
	}

	fmt.Printf("Download: %+v\n", file)
}

// crawlFolder crawls one sub-directory and signals wg when finished. It is
// meant to be started with "go" after a matching wg.Add(1).
func crawlFolder(folder *Folder) {
	defer wg.Done()
	crawl(folder.Url, folder.Dir)
}

// main parses the -host and -locate flags, crawls the host synchronously and
// then waits for every sub-directory goroutine to finish.
func main() {
	host := flag.String("host", "http://localhost:8000", "HTTP服务地址Host")
	location := flag.String("locate", "E:/Crawler下载文件", "本地文件系统绝对路径")
	flag.Parse()

	crawl(*host, *location)
	wg.Wait()
}