I got sick of rewriting the crawl logic again and again, and I don't like using the attribute CSS selectors ([attr=value]).

So I wrote this.


require 'rubygems'
require 'open-uri'
require 'net/http'
require 'hpricot'

# A single crawl rule: a tag selector, a list of preconditions every found
# element must satisfy, and a callback to run for each element that passes.
class Rule
  attr_accessor :tag, :callback

  # tag      - selector handed to the document search.
  # callback - optional block invoked with each accepted element.
  def initialize(tag, &callback)
    @tag = tag
    @callback = callback
    @preconditions = []
  end

  # Returns true when every registered precondition accepts +tag+
  # (and therefore true when no preconditions were registered).
  def all_ok?(tag)
    @preconditions.all? { |condition| condition.call(tag) }
  end

  # Adds a precondition and returns self so calls can be chained.
  #
  # Two call forms are supported:
  #   with(:href, 'index')        - the element's attribute must contain the value
  #   with(lambda { |tag| ... })  - any callable that receives the element
  #
  # A block, when given, becomes the rule's callback. Calls without a block
  # leave the current callback untouched, so a block passed to the constructor
  # or to an earlier +with+ survives the rest of the chain.
  #
  # Raises ArgumentError when an attribute name is given without a value.
  def with(attribute, value = nil, &callback)
    @preconditions << build_condition(attribute, value)
    self.callback = callback if callback
    self
  end

  private

  # Turns the arguments of +with+ into a callable precondition.
  def build_condition(attribute, value)
    return attribute if attribute.respond_to?(:call)
    raise ArgumentError, "a value is required to match attribute #{attribute}" if value.nil?

    name = attribute.to_s
    lambda do |tag|
      actual = tag.attributes[name]
      # An element that lacks the attribute simply does not match.
      !actual.nil? && actual.include?(value)
    end
  end
end

# Loads a document with Hpricot, collects Rule objects via #find, and runs
# them against the document with #farse.
class Farser
  # source - URL (or anything open-uri can open) of the document to parse.
  def initialize(source)
    # URI.open comes from open-uri; plain Kernel#open stopped handling URLs
    # in Ruby 3.0.
    @source = Hpricot(URI.open(source))
    @rules = []
  end

  # Registers a new rule for +tag+ and returns it so preconditions can be
  # chained onto it with Rule#with.
  def find(tag, &callback)
    rule = Rule.new(tag, &callback)
    @rules << rule
    rule
  end

  # Searches the document for each rule's tag and invokes the rule's callback
  # for every found element that passes all of the rule's preconditions.
  # Rules that never received a callback are skipped.
  def farse
    @rules.each do |rule|
      next unless rule.callback

      @source.search(rule.tag) do |found|
        rule.callback.call(found) if rule.all_ok?(found)
      end
    end
  end

  # Downloads +from+ into the file +to+ unless that file already exists.
  #
  # from - URL string to download.
  # to   - destination file path.
  #
  # Returns nil when the file already exists, otherwise the result of the
  # file write.
  def self.fetch(from, to)
    uri = URI.parse(from)
    return if File.exist?(to)

    # Download before touching the file system so a failed request does not
    # leave behind an empty file that would block every later fetch.
    body = Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
      # request_uri keeps the query string and is "/" for an empty path.
      http.get(uri.request_uri).body
    end

    File.open(to, "wb") { |f| f.write(body) }
  end
end


The usage is pretty simple:


# Load the page, then register a rule for <a> elements whose href contains
# 'index' and whose name contains 'someone'.
farser = Farser.new("http://something.com")
farser.find("a").with(:href, 'index').with(:name, 'someone') do |index_link|
  # handle each matching link here
end

# Rules are only registered above; farse runs them against the document.
farser.farse


No error handling and super duper method chaining.

I love it!