乡下人产国偷v产偷v自拍,国产午夜片在线观看,婷婷成人亚洲综合国产麻豆,久久综合给合久久狠狠狠9

  • <output id="e9wm2"></output>
    <s id="e9wm2"><nobr id="e9wm2"><ins id="e9wm2"></ins></nobr></s>

    • 分享

      JAVA多線程網絡爬蟲的代碼實現

       huhuwoo 2015-10-28
      ……
      public class MyCrawler {
          public static BDBFrontier visitedFrontier;
          public static BDBFrontier unvisitedFrontier;
          private static int num = 0;  
           
          public MyCrawler() {
              try{
                  if(visitedFrontier == null){
                      visitedFrontier = new BDBFrontier(CrawlConfig.CRAWL_VISITED_FRONTIER);      //采用Nosql數(shù)據(jù)庫(kù)存儲(chǔ)訪問(wèn)地址方式
                      visitedFrontier.clearAll();
                  }
                  if(unvisitedFrontier == null) {
                      unvisitedFrontier = new BDBFrontier(CrawlConfig.CRAWL_UNVISITED_FRONTIER);
                      unvisitedFrontier.clearAll();
                  }
              }catch(Exception e) {
                  e.printStackTrace();
              }
          }
           
          private void initCrawlerWithSeeds(String[] seeds) {
              synchronized (this) {
                  try {
                      for(int i = 0;i<seeds.length;i++){
                          CrawlUrl url = new CrawlUrl();            //采用berkeleyDB形式
                          url.setOriUrl(seeds[i]);
                          unvisitedFrontier.putUrl(url);
                           
                      }
                  catch(Exception e) {
                      e.printStackTrace();
                  }
              }
          }
           
          public  void crawling(String[] seeds, int threadId) {
              try {
                  LinkFilter filter = new LinkFilter() {
                      @Override
                      public boolean accept(String url) {
                          Pattern pattern = Pattern.compile("^((https|http|ftp|rtsp|mms)?://)"
                                  "+(([0-9a-z_!~*'().&=+$%-]+: )?[0-9a-z_!~*'().&=+$%-]+@)?"
                                  "(([0-9]{1,3}\\.){3}[0-9]{1,3}"
                                  "|"
                                  "([0-9a-z_!~*'()-]+\\.)*"
                                  "([0-9a-z][0-9a-z-]{0,61})?[0-9a-z]\\."
                                  "[a-z]{2,6})"
                                  "(:[0-9]{1,4})?"
                                  "((/?)|"
                                  "(/[0-9a-z_!~*'().;?:@&=+$,%#-]+)+/?)$"); 
                          Matcher matcher = pattern.matcher(url);
                          boolean isMatch= matcher.matches();
                          if(isMatch && url.startsWith(CrawlConfig.CRAWL_LIMIT_PATH)) {
                              return true;
                          }
                          else {
                              return false;
                          }
                      }
                  };
               
                   
                  initCrawlerWithSeeds(seeds);
                   
                  //采用berkeleyDB方式存儲(chǔ)
                                      
                  CrawlUrl visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                  //visitedFrontier.putUrl(visitedCrawlUrl);
                   
                  do{
                      System.out.println("線(xiàn)程:" + threadId);
                      if(visitedCrawlUrl == null) {
                          continue;
                      }
                                   
                      String visitedUrl = visitedCrawlUrl.getOriUrl();
                      if(visitedFrontier.contains(visitedUrl)) {            //同步數(shù)據(jù)
                          visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                          continue;
                      }
                       
                      visitedFrontier.putUrl(visitedCrawlUrl);
                       
                      if(null == visitedUrl || "".equals(visitedUrl.trim())) {   //抓取的地址為空
                          visitedFrontier.putUrl(visitedCrawlUrl);
                          visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                          continue;
                      }
                       
                      try{
                          RetrievePage.downloadPage(visitedUrl);                    //下載頁(yè)面
                          Set<String> links = HtmlParserTool.extractLinks(visitedUrl, filter);
                          for(String link :links) {
                              if(!visitedFrontier.contains(link)
                                  &&!unvisitedFrontier.contains(link)    )
                              {
                                  CrawlUrl unvisitedCrawlUrl = new CrawlUrl();
                                  unvisitedCrawlUrl.setOriUrl(link);
                                  unvisitedFrontier.putUrl(unvisitedCrawlUrl);
                              }
                          }
                      }catch(ConnectTimeoutException e) {                            //超時(shí)繼續(xù)讀下一個(gè)地址
                          visitedFrontier.putUrl(visitedCrawlUrl);
                          visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                          num ++;
                          e.printStackTrace();
                          continue;
                      }catch(SocketTimeoutException e) {
                          visitedFrontier.putUrl(visitedCrawlUrl);
                          visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                          num ++;
                          e.printStackTrace();
                          continue;
                      }
                      visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                      num ++;
                       
                  }while(BDBFrontier.threads >0 && num < 1000);
              }
               
              catch (IOException e) {
                  e.printStackTrace();
              }
              catch(Exception e) {
                  e.printStackTrace();
              }
          }
           
      }

        本站是提供個人知識管理的網絡存儲空間,所有內容均由用戶發布,不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息,謹防詐騙。如發現有害或侵權內容,請點擊一鍵舉報。
        轉藏 分享 獻花(0)

        0條評論

        發表

        請遵守用戶 評論公約

        類似文章 更多