/** * GetURLs2.java **/ import java.io.*; import java.net.*; import java.util.*; public class GetURLs2 { /** タグのリーダ */ ReadTag2 reader; public GetURLs2(URL theURL) throws IOException { reader = new ReadTag2(theURL); } public GetURLs2(String theURL) throws MalformedURLException, IOException { reader = new ReadTag2(theURL); } /** 抽出対象となるタグ */ public final static String[] wantTags = { "","","","","<b>","<i>","<meta","<strong>","<title>","<H1","<h1","<H2","<h2","<H3","<h3","<H4","<h4","<H5","<h5"}; public ArrayList getTags() throws IOException { ArrayList TopicTags = new ArrayList(); String tag; try{ while ((tag = reader.nextTag()) != null) { StringBuffer EndTag = new StringBuffer(); EndTag.append("</"); for (int i = 0; i < wantTopicTags.length; i++) { if (tag.startsWith(wantTopicTags[i])){ if(i==2 || i==7){//"<META" or "<meta"の時 TopicTags.add(tag); break; } String tmp_EndTag0 = wantTopicTags[i]; int length = tmp_EndTag0.length(); String tmp_EndTag = tmp_EndTag0.substring(1,length); EndTag.append(tmp_EndTag); String str_EndTag = EndTag.toString(); String TopicTag = reader.readTopicTag(str_EndTag); StringBuffer topictag = new StringBuffer(tag); topictag.append(TopicTag); topictag.toString(); //System.out.println("topictag:"+topictag); TopicTags.add(topictag); break; } //end if } // end for } // end while }catch(IOException e){ System.out.println("getTags Error:"+e); } return TopicTags; } public void close() throws IOException { if (reader != null) reader.close(); } /*************************************************************/ public static Maps todo(Maps MAP) throws MalformedURLException, IOException { int i=-1; int RSsize = MAP.RootSetSize; System.out.print("\nRootSet");; while(RSsize>i+1){ try{ i++; Object objurl = MAP.BaseSet.get(i); String theURL = objurl.toString(); System.out.print("["+i+"]"); if(theURL.startsWith("http://www.nias.affrc.go.jp/")){ System.out.println("continue"); continue; } GetURLs2 gu2 = new GetURLs2(theURL); ArrayList urls = new ArrayList(gu2.getURLs()); /*取ってきたアンカーを表示 System.out.println("***************"); Tools.Output(urls); System.out.println("***************"); */ // 親URLの修正 String tmp_theURL = Change.Change_theURL(theURL); int size = urls.size(); for(int j=0;j<size;j++){ Object obj = urls.get(j); String tmp_url1 = obj.toString(); /***<A HREF="???.html">から[???.html]を抜きだす***/ int e_index = tmp_url1.indexOf("\"",9); String tmp_url2 = tmp_url1.substring(9,e_index); /********** 抜きだし完了 **********/ if(Check.EndCheck(tmp_url2)==-1) //拡張子等判定 continue; /***[if] 「<a href="http:」で始まらない場合、親URLに子リンクをつなげる***/ String href = tmp_url1.substring(0,14); href = href.toLowerCase(); if(!href.equals("<a href=\"http:")){ int cd =0; int home=0; StringBuffer cd_url = new StringBuffer(tmp_url2); try{// 子リンクの修正 while(tmp_url2.startsWith(".") || tmp_url2.startsWith("/")){ if(tmp_url2.startsWith("./")){ cd_url.delete(0,2); tmp_url2 = new String(cd_url); }else if(tmp_url2.startsWith("../")){ cd_url.delete(0,3); tmp_url2 = new String(cd_url); cd ++; }else if(tmp_url2.startsWith("/")){ cd_url.delete(0,1); tmp_url2 = new String(cd_url); home=1; break; }else{ System.out.println("cd_else:" + tmp_url2); cd = -1; break; } } }catch(Exception e){ System.out.println("cd_error:" + e); cd =-1; } StringBuffer tmp_theURL1; if(cd==-1) continue; if(cd>0 || home>0){// 親URLのディレクトリ修正 String cd_theURL = Change.cd_theURL(tmp_theURL,cd,home); tmp_theURL1 = new StringBuffer(cd_theURL); }else{ tmp_theURL1 = new StringBuffer(tmp_theURL); } tmp_theURL1.append(tmp_url2); String tmp_theURL2 = tmp_theURL1.toString(); if(Check.LoopCheck(MAP,tmp_theURL2,theURL)==0){ MAP.LHMap.put(tmp_theURL2,theURL); if(Check.LoopCheck_BS(MAP,tmp_theURL2)==0) MAP.BaseSet.add(tmp_theURL2); }else{//同じkeyがある場合→同じ親かチェック if(Check.LoopCheck_Oya(MAP,tmp_theURL2,theURL)==0){ MAP.DiffOya.add(theURL); MAP.DiffOya.add(tmp_theURL2); } } } // end if{ /***[else] 「<a href="http:」で始まる場合、抜き出したリンクをそのまま追加***/ else{ try{ if(Check.LoopCheck(MAP,tmp_url2,theURL)==0){ MAP.LHMap.put(tmp_url2,theURL); if(Check.LoopCheck_BS(MAP,tmp_url2)==0) MAP.BaseSet.add(tmp_url2); }else{ if(Check.LoopCheck_Oya(MAP,tmp_url2,theURL)==0){ MAP.DiffOya.add(theURL); MAP.DiffOya.add(tmp_url2); } } }catch(Exception e){ System.out.println("GetURLs Error:" + e); break; } } } // end for() }catch(Exception e){ System.out.print(" Not anchor "); } } //end while return(MAP); } /******topictag*************************************/ public static Maps SearchTopicTags2(Maps MAP) throws MalformedURLException, IOException { System.out.println("\nSearchTopicTags2"); try{ for(int i=0;i<(MAP.BaseSet.size());i++){ try{ System.out.print(i+" "); String str_i = String.valueOf(i); MAP.TopicTags.add(str_i); Object URL = MAP.BaseSet.get(i); System.out.println(URL); String URLURL = URL.toString(); GetURLs2 gu2 = new GetURLs2(URL.toString()); ArrayList tmp_TT = gu2.getTags(); Iterator urliterator = tmp_TT.iterator(); while (urliterator.hasNext()) { // System.out.println(urliterator.next()); MAP.TopicTags.add(urliterator.next()); } }catch(Exception e){ System.out.println("SearchTopicTags2 for() Error:"+e); } MAP.TopicTags.add("0"); } }catch(Exception e){ System.out.println("SearchTopicTags2 Error:"+e); } MAP.TopicTags.add("00"); System.out.println("end"); return MAP; } }