ホスト内のWebページをたどりリンクされているファイルを抜き出すスクリプト

Posted on Thu 06 August 2009 in Ruby

Hpricotを利用して、HTMLページでaタグのhref属性でリンクされているファイルを抜き出すスクリプトを書いてみた。
リンクされているのが同じホスト内のhtmlならそのhtmlもパースします。

#!ruby -Ks
require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'timeout'
require 'uri'
# Crawls the pages of a single host and collects the URLs of linked files.
#
# Starting from one page, every <a href> is resolved against the URL of the
# page it appears on. Links that stay on the same host and whose path has a
# file extension are recorded; recorded ".html" links are crawled in turn.
class GetWebFile
  # Seconds allowed for fetching and parsing a single page.
  FETCH_TIMEOUT_SEC = 1

  def initialize()
    # Recorded file URLs in discovery order. The list also serves as the
    # "already seen" set that keeps the recursion from revisiting pages.
    @file_ary = []
  end

  # Fetches the page at +url+, records the same-host files it links to and
  # recurses into linked ".html" pages.
  #
  # url:: absolute URL (String) of the page to scan.
  #
  # Returns nothing meaningful. Pages that time out or answer with an HTTP
  # error are reported through +warn+ and skipped. Raises
  # URI::InvalidURIError when +url+ itself cannot be parsed.
  def parse_url(url)
    root_uri = URI(url)
    doc = fetch_document(url)
    return if doc.nil?

    (doc/"a").each do |atag|
      href = atag['href']
      next if href.nil? || href.empty?

      link_uri = resolve_link(root_uri, href)
      next if link_uri.nil?
      # Only handle links that stay on the same host.
      next if root_uri.host != link_uri.host

      file_url = link_uri.to_s
      # Skip anything that has already been processed.
      next if @file_ary.include?(file_url)

      # Use the resolved path so a fragment or query never leaks into the
      # extension.
      ext = File.extname(link_uri.path.to_s)
      # Links without an extension or with a query string are neither
      # recorded nor followed; following an unrecorded page could recurse
      # forever because nothing would mark it as seen.
      next if ext.empty? || !link_uri.query.nil?

      # Add to the file list.
      @file_ary.push(file_url)
      # Follow the link.
      parse_url(file_url) if ext == ".html"
    end
  end

  # Prints every recorded file URL on its own line, in discovery order.
  def print_files()
    @file_ary.each { |file| puts file }
  end

  private

  # Downloads +url+ and returns the parsed Hpricot document, or nil (after
  # a warning) when the fetch times out or the server returns an HTTP error.
  def fetch_document(url)
    Timeout.timeout(FETCH_TIMEOUT_SEC) do
      # URI.open is the supported entry point on current Rubies; older
      # open-uri versions only patch Kernel#open. The block form closes the
      # stream as soon as the body has been read.
      if URI.respond_to?(:open)
        URI.open(url) { |io| Hpricot(io.read) }
      else
        open(url) { |io| Hpricot(io.read) }
      end
    end
  rescue Timeout::Error => to
    warn to
    nil
  rescue OpenURI::HTTPError => ex
    warn ex.message
    nil
  end

  # Resolves +href+ against +base_uri+ following the URI reference
  # resolution rules (handles "../", root-relative paths, ports and other
  # schemes) and drops the fragment so "page.html#top" and "page.html" are
  # treated as one resource. Returns nil when +href+ is not a usable URI.
  def resolve_link(base_uri, href)
    link_uri = base_uri.merge(href)
    link_uri.fragment = nil
    link_uri
  rescue URI::Error
    nil
  end
end

# Script entry point: crawl the site starting at the URL given as the first
# command-line argument and print every file URL that was collected.
if $0 == __FILE__ then
  # Without a start URL there is nothing to crawl; exit with a usage hint
  # instead of failing inside URI() with a stack trace.
  abort "usage: #{File.basename($0)} URL" if ARGV.empty?

  get_web_file = GetWebFile.new
  get_web_file.parse_url(ARGV[0])
  get_web_file.print_files()
end