#!/usr/bin/env ruby #:mode=ruby: # This program is released under the GNU General Public Licence. Please see # http://opensource.org/licenses/gpl-license.php for more information. # Author: Doga Armangil, armangild@yahoo.com PODCATCHER_WEBSITE = 'http://podcatcher.rubyforge.org/' PODCATCHER_VERSION = '3.1.6' # todo: allow files to be selected not only by its MIME type, but also other attributes. Example: --content '^video/ width:680-1024 height:400' # todo: --proxy option # todo: download at most one enclosure or media:content per rss item # todo: support for --content and --language options in search mode # todo: code refactoring: do not duplicate option handling for 'options' option, factor out conversion between MIME type and file extension, avoid code duplication between implementations of download and search functions # todo: "item search" - search function that generates a feed containing relevant items of feeds (":item" or ":show" ?) # todo: option to specify share ratio for torrents # todo: symlink support in directory (for history, cache etc) # todo: improve playlist generation when using --strategy cache (only include audio and video content) # todo: improve --feeds implementation # todo: resuming of failed media downloads # todo: --subscriptions option (subscription d/l limit) # todo: informative exception messages # todo: only fetch bittorrent metainfo for d/l candidates # todo: option to download shows concurrently # todo: "lock" directory to prevent concurrency issues # todo: option to throttle non-BitTorrent downloads # 3.1.6: fix a bug whereby a failed content download caused all other content from the same feed to be ignored, fix ruby 1.9 compatibility bug (String#each becomes String#each_line) # 3.1.5: updated --arguments file format (# now comments out line), updated sponsor message # 3.1.4: added publication date to content titles in generated playlists, added better handling of invalid URLs in feeds and subscription lists (such URLs are now 
simply ignored instead of causing the whole document to be skipped) # 3.1.3: --restrictednames option is now enabled by default, fixed directory name generation bug that allowed '!' character when --perfeed and --restrictednames options were used simultaneously, updated sponsor message # 3.1.2: modified the help text that appears when --help option is used, updated sponsor message # 3.1.1: fixed a bug in verbose mode that caused content to be listed twice if it is declared as both RSS enclosure and Media RSS content, changed the sponsor message # 3.1.0: added support for yyyy and yyyy.mm formats for --horizon parameter # 3.0.0: added the --cachedir option for explicitely specifying cache directory, added --language option for selecting feeds by language, added the --horizon option that prevents the downloading of content older than a given date, added --restrictednames option for using content subdirectory and file names that are acceptable for restrictive filesystems such as VFAT, http://search.yahoo.com/mrss is now accepted as namespace for RSS Media module, fixed a bug in update checking (flash now only appears if podcatcherstats version is newer than current one), fixed a bug that caused votes to be sent for feeds that have file URLs or filenames. 
# 2.0.1: fixed Yahoo Media RSS module handling bug # 2.0.0: fixed a bug that caused the generation of invalid playlists for feeds containing control characters (such as Ctrl-M) in their title or in the title of one of its entries, added --order option that determines feed order, changed default feed order from 'sequential' to 'random', all content is downloaded by default (not only MP3), changed default cache size to 512MB, added support for the Yahoo Media RSS module (http://search.yahoo.com/mrss), added strategies for downloading content in chronological order (chron_one, chron, chron_all), added -C option that specifies the types of content that are to be received (overrides the default types), added -o option for reading options from a file, added -A option for reading arguments from a file, changed the default download strategy to 'one', added -V alias for --version option, fixed a bug that caused the order of feeds to be ignored in OPML files, fixed a bug that caused downloads of some video files to fail in vodcatcher mode, added --checkforupdate option for informing the user when a new version is available, added --vote option for voting in favour of downloaded podcasts at podcatcherstats.com # 1.3.7: added status code and content type check when downloading a media file using HTTP, removed some debugging comments # 1.3.5: fixed a bug that caused wrong cache filenames to be generated when an HTTP redirection was received from a server, added Operating System and processor information to the User-Agent HTTP header sent to web servers # 1.3.4: fixed the help message # 1.3.3: added the -p option that assigns a separate cache subfolder to each feed # 1.3.2: bug fix # 1.3.1: added robust handling of subscription lists that directly link to media files (such links are now ignored), fixed an OPML generation bug for interrupted searches # 1.3.0: added search function for online podcast directories such as the iPodder podcast directory, added xspf support # 1.2.0: 
added support for decentralized subscription lists (i.e. subscription lists that point to other subscription lists), fixed a bug that sometimes caused an invalid Referer header to be sent in HTTP requests, added the -f option, added support for Atom feeds that do not list items in reverse chronological order, added support for RSS/Atom feeds as command line arguments, added support for Extended M3U and Extended PLS playlist formats, M3U playlists can now also be generated in vodcatcher mode, m3u is now the default type in vodcatcher mode, added "cache" strategy which deprecates -c option # 1.1.1: added support for iTunes .pcast subscription files # 1.1.0: names of media files downloaded via BitTorrent are now preserved, done some refactoring so that the script can function as a vodcatcher # 1.0.4: added support for RSS feeds that do not list items in reverse chronological order # 1.0.3: fixed an RSS parsing bug that caused enclosures of some feeds to be ignored # 1.0.2: fixed some minor MP3 file naming bugs # 1.0.1: names of downloaded MP3 files are now preserved # 1.0.0: added ATOM support # 0.4.0: added duplicate removal for MP3, RSS/Atom and OPML URLs and pathnames; added the -i option that attempts to increase the listen-time given to podcasts which frequently release short shows # 0.3.2: fixed BitTorrent handling bug # 0.3.1: added robust handling of network exceptions, removed support for Ctrl-C to terminate execution # 0.3.0: added support for opml format used by podcastalley, added podcast title information in playlists, reduced RAM usage by not loading the history file in memory, history file and playlist are now updated after each download # 0.2.1: added support for Ctrl-C to terminate execution; added robust handling of some bad command line arguments; (James Carter patch) fixed the "OPML truncation" issue where a bad RSS feed was considered the last of the list # 0.2.0: added a new download strategy ("one"); added support for more than one OPML 
argument, fixed some issues # 0.1.7: bug fix # 0.1.6: added internal Bittorrent support, fixed flawed handling of some exceptions # 0.1.5: changed -d option description, added external handling of Bittorrent files # 0.1.4: bug-fix, robust handling of bad //enclosure/@length attributes, handling of relative enclosure URLs # 0.1.3: podcast download strategies (and changed default), download retries # 0.1.2: added TOX playlist support, added HTTP and FTP support for the OPML parameter, done some code clean-up # 0.1.1: fixed RSS parsing issue # 0.1.0: initial version require 'uri' require 'open-uri' require 'ostruct' require 'optparse' require 'pathname' require 'date' require 'cgi' require 'yaml' require 'net/http' require 'rexml/document' include REXML #PODCATCHER_ENV = :development PODCATCHER_ENV = :production USER_AGENT = "podcatcher/#{PODCATCHER_VERSION} Ruby/#{RUBY_VERSION} #{RUBY_PLATFORM}" UPDATE_CHECK_INTERVAL = 6 #months opt = OpenStruct.new opt.PLAYLIST_TYPES = [:m3u, :smil, :pls, :asx, :tox, :xspf] opt.playlist_type = opt.PLAYLIST_TYPES[0] opt.size = 512 opt.content_type = Regexp.new '' opt.DESCRIPTION = <=1 when 'memsize' if value.instance_of?(Fixnum) opt.memsize = value opt.memsize = nil if opt.memsize<1 end when 'content' begin opt.content_type = Regexp.new(value.downcase) rescue Exception $stderr.puts "Error: '#{value.downcase}' is not a valid regular expression and will be ignored" end when 'language' opt.language = value.split ',' for i in 0...opt.language.size opt.language[i].downcase! opt.language[i] = opt.language[i].split '-' end when 'order' opt.order = value.to_sym if opt.ORDERS.detect{|s| value.to_sym == s} when 'function' opt.function = value.to_sym if opt.FUNCTIONS.detect{|s| value.to_sym == s} when 'feeds' if value.instance_of?(Fixnum) opt.feeds = value opt.feeds = nil if opt.feeds<1 end when 'horizon' begin date = value.split '.' if (1..3).include? 
date.size while date.size < 3 date << '01' end opt.horizon = Date.parse date.join('-') end rescue ArgumentError end when 'torrentdir' dir = Pathname.new value if dir.exist? and dir.directory? opt.torrent_dir = dir end when 'uploadrate' opt.upload_rate = value if value.instance_of?(Fixnum) and value>=1 when 'itemsize' opt.itemsize = value if value.instance_of?(Fixnum) and value>=0 when 'perfeed' opt.per_feed = value if value.instance_of?(FalseClass) or value.instance_of?(TrueClass) when 'cache' opt.strategy = :cache if value.instance_of?(TrueClass) when 'empty' opt.empty = value if value.instance_of?(FalseClass) or value.instance_of?(TrueClass) when 'asif' opt.simulate = value if value.instance_of?(FalseClass) or value.instance_of?(TrueClass) when 'checkforupdate' opt.check_for_update = value if value.instance_of?(FalseClass) or value.instance_of?(TrueClass) when 'vote' opt.vote = value if value.instance_of?(FalseClass) or value.instance_of?(TrueClass) when 'verbose' opt.verbose = value if value.instance_of?(FalseClass) or value.instance_of?(TrueClass) when 'restrictednames' opt.restricted_names = value if value.instance_of?(FalseClass) or value.instance_of?(TrueClass) end end break end end c.separator "" c.separator "Usage examples:" c.separator " #{$0} --dir ~/podcasts http://podcastalley.com/PodcastAlleyTop50.opml > latest.m3u" c.separator "" c.separator " #{$0} --dir ~/podcasts rss.xml atom.xml *.pcast feeds.opml http://host/second.opml > latest.m3u" c.separator "" c.separator " #{$0} --dir ~/podcasts --strategy cache > cache.m3u" c.separator "" c.separator " cat feeds.opml | #{$0} --dir ~/podcasts > latest.m3u" c.separator "" c.separator " #{$0} -vd ~/podcasts -s 500 -m 10_000 -t tox feeds.opml > latest.tox" c.separator "" c.separator " #{$0} -vF search 'book health daily source code' 'http://www.ipodder.org/discuss/reader$4.opml' > results.opml" c.separator "" c.separator " #{$0} -F search -f 12 mac http://www.podfeed.net/opml/directory.opml > results.opml" 
end option_parser.parse! class Playlist def initialize(playlisttype) @playlisttype = playlisttype @audio_or_video = Regexp.new '^audio/|^video/' @size = 0 end def start() @str = "" case @playlisttype when :tox @str = "# toxine playlist \n" when :m3u @str = "#EXTM3U\n" when :pls @str = "[playlist]\n" when :asx @str = < END when :smil @str = < END when :xspf @doc = Document.new @doc.xml_decl.dowrite @doc.add_element Element.new("playlist") @doc.root.add_attribute "version", "1" @doc.root.add_attribute "xmlns", "http://xspf.org/ns/0/" @tracklist = Element.new("trackList") @doc.root.add_element @tracklist end print @str @str end def add(content) return unless content if content.mime return unless @audio_or_video =~ content.mime end @size+=1 feed_title = content.feed_title feed_title = '' unless feed_title feed_title = sanitize feed_title title = content.title title = '' unless title title = sanitize title title = "#{content.pub_date.strftime('%Y.%m.%d')} - "+title if content.pub_date entry = "" case @playlisttype when :m3u feed_title = feed_title.gsub(/,/," ") title = title.gsub(/,/," ") entry = "#EXTINF:-1,[#{feed_title}] #{title}\n#{content.file.to_s}\n" when :pls entry = "File#{@size}:#{content.file}\nTitle#{@size}:[#{feed_title}] #{title}\nLength#{@size}:-1\n" when :asx entry = " \n" when :smil entry = " \n" when :tox entry = "entry { \n\tidentifier = [#{feed_title}] #{title};\n\tmrl = #{content.file};\n};\n" when :xspf track = Element.new("track") @tracklist.add_element track title = Element.new("title") title.add_text "[#{feed_title}] #{title}" track.add_element title location = Element.new("location") location.add_text fileurl(content.file) track.add_element location end @str += entry print entry entry end def finish() res = "" case @playlisttype when :tox res = "# end " when :asx res = < END when :smil res = < END when :pls res = "NumberOfEntries=#{@size}\nVersion=2\n" when :xspf @doc.write $stdout, 0 end @str += res print res res end def to_s() if @doc 
@doc.to_s else @str end end private def fileurl(path) res = "" loop do path, base = path.split if base.root? if base.to_s != "/" res = "/"+CGI.escape(base.to_s)+res end break end res = "/"+CGI.escape(base.to_s)+res end "file://"+res end def sanitize(text) #removes invisible characters from text return nil unless text res = '' text.each_byte() do |c| case c when 0..31, 127 #control chars res << ' ' else res << c end end res end end class Update def initialize(dir) @now = Time.now @data = {'last-check' => @now, 'latest-version' => PODCATCHER_VERSION, 'latest-version-description' => ''} @server = URI.parse('http://www.podcatcherstats.com/podcatcher/latest_release') @server = URI.parse('http://0.0.0.0:3000/podcatcher/latest_release') if PODCATCHER_ENV == :development return unless dir return unless dir.directory? @file = dir + 'updates' if @file.exist? and @file.file? begin data = nil @file.open() do |f| data = YAML.load f end if data.instance_of? Hash if newer_or_equal? data['latest-version'] data.each() do |key, value| case key when 'last-check' @data[key] = value if value.instance_of? Time and value < @now when 'latest-version' @data[key] = value if value.instance_of? String when 'latest-version-description' @data[key] = value if value.instance_of? String end end end end rescue Interrupt @file.delete rescue SystemExit exit 1 rescue Exception @file.delete end end save exit 1 unless @file.file? end def check() if @now - @data['last-check'] > 60.0 * 60.0 * 24 * 30 * UPDATE_CHECK_INTERVAL @data['last-check'] = @now begin Net::HTTP.start(@server.host, @server.port) do |http| resp = http.get(@server.path, {'User-Agent' => USER_AGENT, 'Connection' => 'close'}) loop do break unless resp.code =~ Regexp.new('^2') doc = Document.new resp.body break unless doc and doc.root and doc.root.name == 'release' version = XPath.first doc.root, 'version' break unless version break unless newer? 
version.text description = XPath.first doc.root, 'description' if description description = description.text.strip else description = '' end @data['latest-version'] = version.join '.' @data['latest-version-description'] = description save break end # read resp.body end rescue Interrupt rescue SystemExit exit 1 rescue Exception end end flash end def to_s() res = '' if @data @data.each() do |key, value| res+= "#{key}: #{value}\n" end end res end private def flash() return unless newer? @data['latest-version'] #if equal? @data['latest-version'] #constants line_length = 70 p = '**** ' # $stderr.puts "" $stderr.puts p+"New release:" $stderr.puts p+"Version #{@data['latest-version']} is available at #{PODCATCHER_WEBSITE}." if @data['latest-version-description'].size>0 descr = [] @data['latest-version-description'].each() do |line| descr = descr + line.chomp.split(' ') end line = nil descr.each() do |word| if line and (line + ' ' + word).size>line_length $stderr.puts p+line line = nil end if line line += ' '+word else line = word end end $stderr.puts p+line if line end $stderr.puts "" end def save() @file.open('w') do |f| YAML.dump @data, f end end def compare_with(version) # Return values: -1: versioninstalled_version return -1 unless version version = version.strip.split '.' for i in 0...version.size version[i] = version[i].to_i end current_version = PODCATCHER_VERSION.strip.split '.' for i in 0...current_version.size current_version[i] = current_version[i].to_i end res = 0 for i in 0...version.size break if i>=current_version.size if current_version[i]>version[i] res = -1 break end if current_version[i] 1.0} @server = URI.parse('http://www.podcatcherstats.com/podcatcher/ping') @server = URI.parse('http://0.0.0.0:3000/podcatcher/ping') if PODCATCHER_ENV == :development return unless dir return unless dir.directory? @file = dir + 'votes' if @file.exist? and @file.file? 
data = nil begin @file.open() do |f| data = YAML.load f end rescue Interrupt @file.delete rescue SystemExit exit 1 rescue Exception @file.delete end if data.instance_of? Hash # $stderr.puts "votes file read" data.each() do |key, value| case key when 'ping-probability' @data[key] = value unless value<0.0 or 1.0 0 and feed[0].feedurl and feed[0].feedurl.size<255 and (not URI.parse(feed[0].feedurl).instance_of?(URI::Generic)) and sent_feeds < max_sent_feeds stats.root.add_element 'feed', {'url' => feed[0].feedurl} sent_feeds += 1 end end break unless sent_feeds>0 #send stats_str = '' stats.write stats_str if PODCATCHER_ENV != :production $stderr.puts "Sent:" $stderr.puts stats_str end change_state = nil Net::HTTP.start(@server.host, @server.port) do |http| resp = http.request_post @server.path, stats_str, 'User-Agent' => USER_AGENT, 'Content-Type' => 'application/xml', 'Connection' => 'close' if PODCATCHER_ENV != :production $stderr.puts "Received:" $stderr.puts "#{resp.body}" end change resp.body end @data['last-ping'] = now+0 break end rescue Interrupt # $stderr.puts "int1 #{$!}" rescue SystemExit exit 1 rescue Exception # $stderr.puts "exc #{$!}" end @data['last-session'] = now+0 save # $stderr.puts "#{to_s}" end def ping_search(opt, query) return unless opt return unless query return if opt.simulate now = Time.now begin loop do break unless opt.vote break unless ping? # $stderr.puts "ping.." 
stats = Document.new stats.add_element 'searching', {'query' => query} #state stats.root.add_element state_element #send stats_str = '' stats.write stats_str # $stderr.puts stats_str change_state = nil Net::HTTP.start(@server.host, @server.port) do |http| resp = http.request_post @server.path, stats_str, 'User-Agent' => USER_AGENT, 'Content-Type' => 'application/xml', 'Connection' => 'close' # $stderr.puts "#{resp.body}" change resp.body end @data['last-ping'] = now+0 break end rescue Interrupt # $stderr.puts "int1 #{$!}" rescue SystemExit exit 1 rescue Exception # $stderr.puts "exc #{$!}" end @data['last-session'] = now+0 save # $stderr.puts "#{to_s}" end def to_s() res = '' if @data @data.each() do |key, value| res+= "#{key}: #{value}\n" end end res end private def save() @file.open('w') do |f| YAML.dump @data, f end end def ping?() r = rand # $stderr.puts "random: #{r}, ping-probability: #{@data['ping-probability']}" return r < @data['ping-probability'] end def change(doc_str) return unless doc_str begin change_state = Document.new doc_str loop do break unless change_state break unless change_state.root break unless change_state.root.name == 'state' #ping-probability ping = change_state.root.attributes['ping'] if ping and ping.size>0 ping = ping.to_f unless ping<0.0 or 1.0 limit #shrink @history_old.delete if @history_old.exist? @history.rename @history_old @history.open("w") do |f| @history_old.each_line() do |url| f.print(url) if history_size <= limit history_size -= 1 end end @history_old.unlink end rescue Interrupt, SystemExit exit 1 rescue Exception $stderr.puts "Error: failure during history file clean-up." 
    end if limit
  end
end

# Cache drives a download run: it scans (and optionally prunes) the on-disk
# cache directory on construction, and createplaylist fetches feeds/content
# and streams a playlist to stdout.
class Cache
  def initialize(opt)
    super()
    @opt = opt
    # NOTE(review): class variables assigned inside #initialize — they behave
    # as shared constants here.
    @@TORRENT = "application/x-bittorrent"
    @@MEDIA_RSS_NS = ['http://search.yahoo.com/mrss/']
    @@MEDIA_RSS_NS << 'http://search.yahoo.com/mrss'
    @@ATOM_NS = Regexp.new "^http://purl.org/atom/ns#"
    # history of already-downloaded content
    @history = History.new opt.dir
    # usage statistics / voting
    @stats = Stats.new opt.dir
    # cache directory
    @cache_dir = opt.cachedir #opt.dir+"cache"
    @cache_dir.mkdir() unless @cache_dir.exist?
    exit 1 unless @cache_dir.directory?
    # First pass: with --empty delete cached files (unless simulating or only
    # replaying the cache), and drop per-feed subfolders that became empty.
    @cache_dir.each_entry() do |e|
      e = @cache_dir + e
      e = e.cleanpath
      next if e == @cache_dir or e == @cache_dir.parent
      if e.directory? #feed subfolder
        e.each_entry() do |e2|
          e2 = e + e2
          next if e2.directory?
          if opt.empty
            unless opt.simulate or opt.strategy == :cache
              $stderr.puts "Deleting: #{e2}" if opt.verbose
              e2.delete
            end
          end
        end
        # entries.size == 2 means only '.' and '..' remain.
        e.delete if e.entries.size == 2
      elsif opt.empty
        unless opt.simulate or opt.strategy == :cache
          $stderr.puts "Deleting: #{e}" if opt.verbose
          e.delete
        end
      end
    end
    # Second pass: build the in-memory cache index, one OpenStruct(file, size,
    # title) per cached file, including files inside per-feed subfolders.
    @cache = @cache_dir.entries.collect() do |e|
      e = @cache_dir + e
      e = e.cleanpath
      next if e == @cache_dir or e == @cache_dir.parent
      if e.file?
        content = OpenStruct.new
        content.file = e
        content.size = e.size
        content.title = e.to_s
        content
      elsif e.directory?
        e.entries.collect() do |e2|
          e2 = e + e2
          if e2.file?
            content = OpenStruct.new
            content.file = e2
            content.size = e2.size
            content.title = e2.to_s
            content
          else
            nil
          end
        end
      else
        nil
      end
    end
    @cache.flatten!
    @cache.compact!
    # Oldest first — eviction later removes from the front.
    @cache.sort!() do |e, e2|
      e.file.mtime() <=> e2.file.mtime()
    end
  end

  # Downloads content for the given feed/subscription-list URLs and streams a
  # playlist to stdout; with --strategy cache it only replays cached files.
  def createplaylist(urls)
    playlist = Playlist.new @opt.playlist_type
    if @opt.strategy == :cache
      playlist.start
      # newest first for playback
      @cache.reverse!
@cache.each() do |content| playlist.add content end playlist.finish return playlist.to_s end playlist.start doc = nil if urls.size == 0 $stderr.puts "Reading document from standard input" if @opt.verbose begin xml = "" $stdin.each() do |e| xml += e end doc = OpenStruct.new doc.dom = Document.new(xml) doc = nil unless doc.dom rescue Interrupt, SystemExit exit 1 rescue Exception $stderr.puts "Error: unreadable document" doc = nil end end dochistory = [] feeds = [] urls.uniq! links = urls.collect() do |e| l = OpenStruct.new l.url = e l end loop do break if @opt.feeds and feeds.size >= @opt.feeds while not doc link = links.shift break unless link if dochistory.detect{|e| e == link.url} $stderr.puts "Skipping duplicate: #{link.url}" if @opt.verbose next end $stderr.puts "Fetching: #{link.url}" if @opt.verbose dochistory << link.url begin doc = fetchdoc(link) rescue Interrupt, SystemExit exit 1 rescue Exception $stderr.puts "Error: skipping unreadable document" end end break unless doc begin if doc.dom.root.name == "opml" newlinks = [] outlines = [] doc.dom.elements.each("/opml/body") do |body| body.elements.each() do |e| next unless e.name == 'outline' outlines << e end end while outlines.size>0 outline = outlines.shift url = outline.attributes["xmlUrl"] url = outline.attributes["url"] unless url if url begin url = URI.parse(doc.url).merge(url).to_s if doc.url link = OpenStruct.new link.url = url link.referrer = doc.url newlinks << link rescue URI::InvalidURIError end next end new_outlines = [] outline.elements.each() do |e| next unless e.name == 'outline' new_outlines << e end outlines = new_outlines + outlines end links = newlinks + links elsif doc.dom.root.name == "pcast" newlinks = [] XPath.each(doc.dom,"//link[@rel='feed']") do |outline| url = outline.attributes["href"] next unless url begin url = URI.parse(doc.url).merge(url).to_s if doc.url link = OpenStruct.new link.url = url link.referrer = doc.url newlinks << link rescue URI::InvalidURIError end end links = 
newlinks + links elsif doc.dom.root.namespace =~ @@ATOM_NS feed = [] XPath.each(doc.dom.root,"//*[@rel='enclosure']") do |e2| next unless e2.namespace =~ @@ATOM_NS content = OpenStruct.new XPath.each(e2,"parent::/title/text()") do |node| content.title = "" node.value.each_line() do |e3| #remove line breaks content.title+= e3.chomp+" " end content.title.strip! end XPath.each(e2,"parent::/created/text()") do |node| pub_date = "" node.value.each_line() do |e3| #remove line breaks pub_date+= e3.chomp+" " end begin content.pub_date = DateTime.parse(pub_date.strip, true) rescue Exception end end content.mime = e2.attributes["type"].downcase next if @opt.content_type !~ content.mime and content.mime != @@TORRENT next if content.mime == @@TORRENT and not (@opt.torrent_dir or @opt.rubytorrent) content.feedurl = doc.url begin content.url = URI.parse(content.feedurl).merge(e2.attributes["href"]).to_s if content.feedurl content.size = e2.attributes["length"].to_i content.size = 2 unless content.size and content.size>0 content.size = 0 if content.mime == @@TORRENT #not strictly necessary feed << content rescue URI::InvalidURIError end end #sort by date feed.sort!() do |a,b| if a.pub_date if b.pub_date b.pub_date <=> a.pub_date else -1 end else if b.pub_date 1 else 0 end end end feed.each() do |content| $stderr.puts "Enclosure: #{content.url}" end if @opt.verbose #title node = XPath.first(doc.dom,"/feed/title/text()") feed_title = "" node.value.each_line() do |e3| #remove line breaks feed_title += e3.chomp+" " end feed_title.strip! feed.each() do |content| content.feed_title = feed_title end # feeds << feed elsif doc.dom.root.name = "rss" feed = [] doc.dom.root.elements.each() do |e| #channel e.elements.each() do |e1| #item title = '' XPath.each(e1,"title/text()") do |node| title = '' node.value.each_line() do |e3| #remove line breaks title+= e3.chomp+" " end title.strip! 
end pub_date = nil XPath.each(e1,"pubDate/text()") do |node| pub_date = "" node.value.each_line() do |e3| #remove line breaks pub_date+= e3.chomp+" " end begin pub_date = DateTime.parse(pub_date.strip, true) rescue Exception pub_date = nil end end e1.elements.each() do |e2| if e2.name == "enclosure" content = OpenStruct.new content.title = title content.pub_date = pub_date content.mime = e2.attributes["type"].downcase next if @opt.content_type !~ content.mime and content.mime != @@TORRENT next if content.mime == @@TORRENT and not (@opt.torrent_dir or @opt.rubytorrent) content.feedurl = doc.url begin content.url = URI.parse(content.feedurl).merge(e2.attributes["url"]).to_s if content.feedurl content.size = e2.attributes["length"].to_i content.size = 2 unless content.size and content.size>0 content.size = 0 if content.mime == @@TORRENT #not strictly necessary feed << content rescue URI::InvalidURIError end elsif @@MEDIA_RSS_NS.include? e2.namespace case e2.name when 'content' content = OpenStruct.new content.title = title content.pub_date = pub_date content.mime = e2.attributes["type"].downcase next if @opt.content_type !~ content.mime and content.mime != @@TORRENT next if content.mime == @@TORRENT and not (@opt.torrent_dir or @opt.rubytorrent) content.feedurl = doc.url begin content.url = URI.parse(content.feedurl).merge(e2.attributes["url"]).to_s if content.feedurl content.size = e2.attributes["fileSize"].to_i content.size = 2 unless content.size and content.size>0 content.size = 0 if content.mime == @@TORRENT #not strictly necessary feed << content rescue URI::InvalidURIError end when 'group' e2.elements.each() do |e4| if e4.name == 'content' and @@MEDIA_RSS_NS.include?(e4.namespace) content = OpenStruct.new content.title = title content.pub_date = pub_date content.mime = e4.attributes["type"].downcase next if @opt.content_type !~ content.mime and content.mime != @@TORRENT next if content.mime == @@TORRENT and not (@opt.torrent_dir or @opt.rubytorrent) 
content.feedurl = doc.url begin content.url = URI.parse(content.feedurl).merge(e4.attributes["url"]).to_s if content.feedurl content.size = e4.attributes["fileSize"].to_i content.size = 2 unless content.size and content.size>0 content.size = 0 if content.mime == @@TORRENT #not strictly necessary feed << content rescue URI::InvalidURIError end break end end end end end if e1.name == "item" end if e.name == "channel" end #remove duplicates (duplication occurs in particular for content declared as both enclosure and Media RSS content) for i in 0...feed.size content = feed[i] next unless content for j in i+1...feed.size next unless feed[j] feed[j] = nil if feed[j].url == content.url end end feed.compact! #sort by date feed.sort!() do |a,b| if a.pub_date if b.pub_date b.pub_date <=> a.pub_date else -1 end else if b.pub_date 1 else 0 end end end feed.each() do |content| $stderr.puts "Enclosure: #{content.url}" end if @opt.verbose #title node = XPath.first(doc.dom,"//channel/title/text()") feed_title = "" node.value.each_line() do |e3| #remove line breaks feed_title += e3.chomp+" " end feed_title.strip! feed.each() do |content| content.feed_title = feed_title end #language if @opt.language.size > 0 loop do node = XPath.first doc.dom, '//channel/language/text()' break unless node break unless node.value feed_lang = node.value.strip.downcase.split '-' break if feed_lang.size == 0 langmatch = @opt.language.collect() do |lang| next false if feed_lang.size < lang.size matches = true for i in 0...lang.size next if lang[i] == feed_lang[i] matches = false end matches end feeds << feed if langmatch.include? 
true break end else feeds << feed end end rescue Interrupt, SystemExit exit 1 rescue Exception $stderr.puts "Error: skipping document because of an internal error" end doc = nil end #remove content older than the horizon date if @opt.horizon feeds.each() do |feed| for i in 0...feed.size if feed[i].pub_date feed[i] = nil if feed[i].pub_date < @opt.horizon else feed[i] = nil end end feed.compact! end end #apply download strategy @history.mark_old_content feeds if @opt.strategy == :chron or @opt.strategy == :chron_one or @opt.strategy == :chron_all feeds.each() do |feed| feed.reverse! end @opt.strategy = :back_catalog if @opt.strategy == :chron @opt.strategy = :one if @opt.strategy == :chron_one @opt.strategy = :all if @opt.strategy == :chron_all end case @opt.strategy #remove ignored content when :new feeds.each() do |feed| in_hist = nil for i in 0...feed.size if feed[i].in_history in_hist = i break end end feed.slice! in_hist...feed.size if in_hist end when :all else feeds.each() do |feed| for i in 0...feed.size feed[i] = nil if feed[i].in_history end feed.compact! end end if @opt.strategy == :new or @opt.strategy == :one feeds.each() do |feed| itemsize = 0 index = nil for i in 0...feed.size itemsize += feed[i].size if itemsize >= @opt.itemsize index = i+1 break end end feed.slice! index...feed.size if index end end #feed order case @opt.order when :random srand feeds.sort!() do |a,b| if a.size>0 if b.size>0 rand(3)-1 else -1 end else if b.size>0 1 else 0 end end end when :alphabetical feeds.sort!() do |a,b| if a.size>0 if b.size>0 a[0].feed_title <=> b[0].feed_title else -1 end else if b.size>0 1 else 0 end end end when :reverse feeds.reverse! 
        end
    # Remove duplicate content across all feeds (the same item can appear in
    # several subscription lists).
    feeds.each() do |feed|
      feed.each() do |content|
        next unless content
        dup = false
        feeds.each() do |f|
          for i in 0...f.size
            next unless f[i]
            if f[i].url == content.url
              # Keep the first occurrence; nil out every later one.
              f[i] = nil if dup
              dup = true
            end
            $stderr.puts "Removed duplicate: #{content.url}" unless f[i] or (not @opt.verbose)
          end
        end
      end
      feed.compact!
    end
    # Send usage statistics (see Stats#ping; no-op unless voting is enabled).
    @stats.ping @opt, feeds
    # Fetch torrent metainfo files; derive the MIME type from the payload
    # filename and drop torrents that are unusable (multi-file, wrong type).
    feeds.each() do |feed|
      feed.each() do |content|
        next if content.mime != @@TORRENT
        content.mime = nil
        begin
          $stderr.puts "Fetching torrent metainfo: #{content.url}" if @opt.verbose
          content.metainfo = RubyTorrent::MetaInfo.from_location content.url
          content.size = content.metainfo.info.length
          content.mime = case content.metainfo.info.name.downcase
            when /\.mp3$/
              "audio/mpeg"
            when /\.wma$/
              "audio/x-ms-wma"
            when /\.mpg$|\.mpeg$|\.mpe$|\.mpa$|\.mp2$|\.mpv2$/
              "video/mpeg"
            when /\.mov$|\.qt$/
              "video/quicktime"
            when /\.avi$/
              "video/x-msvideo"
            when /\.wmv$/
              "video/x-ms-wmv"
            when /\.asf$/
              "video/x-ms-asf"
            when /\.m4v$|\.mp4$|\.mpg4$/
              "video/mp4"
            else
              nil
          end
          # Vetoed torrents keep url == nil and are pruned below.
          content.url = nil unless content.mime
          content.url = nil unless (@opt.content_type =~ content.mime)
          content.url = nil unless content.metainfo.info.single?
        rescue Interrupt
          content.url = nil
          $stderr.puts "Error: unreadable torrent metainfo" if @opt.verbose
        rescue SystemExit
          exit 1
        rescue Exception
          # NOTE(review): rescuing Exception is very broad; presumably kept
          # deliberately so one bad torrent never aborts the whole run.
          content.url = nil
          $stderr.puts "Error: unreadable torrent metainfo" if @opt.verbose
        end
      end
      # Prune items whose download was vetoed above.
      for i in 0...feed.size
        feed[i] = nil unless feed[i].url
      end
      feed.compact!
end #fetch enclosures item = total = 0 @cache.each() do |e| total+= e.size end torrents = [] torrentfiles = [] inc = 1 while inc>0 inc = 0 itemsize = 0 feeds.each do |e| #find next enclosure in feed content = e.shift unless content itemsize = 0 next end #make place in cache while @opt.size and content.size+inc+total > @opt.size break if @opt.simulate f = @cache.shift break unless f total-= f.size parent = f.file.parent $stderr.puts "Deleting: #{f.file}" if @opt.verbose f.file.delete if parent.parent != @opt.dir and parent.entries.size == 2 #delete empty feed subfolder $stderr.puts "Deleting: #{parent}" if @opt.verbose parent.delete end end unless @opt.simulate break if @opt.size and content.size+inc+total > @opt.size end #download 1.upto(@opt.retries) do |i| begin if content.metainfo if @opt.torrent_dir loop do content.file = @opt.torrent_dir+(Time.now.to_f.to_s+".torrent") break unless content.file.exist? sleep 1 end $stderr.puts "Copying: #{content.url} to #{content.file}" if @opt.verbose and i == 1 if not @opt.simulate if content.feedurl and (content.feedurl =~ %r{^http:} or content.feedurl =~ %r{^ftp:}) open(content.url, "User-Agent" => USER_AGENT, "Referer" => content.feedurl) do |fin| content.file.open("wb") do |fout| fin.each_byte() do |b| fout.putc b end end end else open(content.url, "User-Agent" => USER_AGENT) do |fin| content.file.open("wb") do |fout| fin.each_byte() do |b| fout.putc b end end end end end else $stderr.puts "Fetching in background: #{content.url}" if @opt.verbose and i == 1 unless @opt.simulate content.file = filename(content, @cache_dir) package = RubyTorrent::Package.new content.metainfo, content.file.to_s bt = RubyTorrent::BitTorrent.new content.metainfo, package, :dlratelim => nil, :ulratelim => @opt.upload_rate, :http_proxy => ENV["http_proxy"] torrents << bt torrentfiles << content end inc+= content.size itemsize+= content.size end else $stderr.puts "Fetching: #{content.url} (#{content.size.to_s} bytes)" if @opt.verbose and i == 1 
if not @opt.simulate headers = {"User-Agent" => USER_AGENT} headers["Referer"] = content.feedurl if content.feedurl and (content.feedurl =~ %r{^http:} or content.feedurl =~ %r{^ftp:}) content.download_url = content.url unless content.download_url open(content.download_url, headers) do |fin| if fin.base_uri.instance_of?(URI::HTTP) if fin.status[0] =~ Regexp.new('^3') content.download_url = fin.meta['location'] raise "redirecting" elsif fin.status[0] !~ Regexp.new('^2') raise 'failed' end end # write content to cache content.redirection_url = fin.base_uri.to_s # content.redirection_url is used for finding the correct filename in case of redirection content.redirection_url = nil if content.redirection_url.eql?(content.url) content.file = filename(content, @cache_dir) content.file.open("wb") do |fout| fin.each_byte() do |b| fout.putc b end end end content.size = content.file.size @history.add content end playlist.add(content) inc+= content.size itemsize+= content.size end break rescue Interrupt rescue SystemExit exit 1 rescue Exception end $stderr.puts "Attempt #{i} aborted" if @opt.verbose if content.file and i == @opt.retries if content.file.exist? parent = content.file.parent content.file.delete if parent.parent != @opt.dir and parent.entries.size == 2 #delete empty feed subfolder parent.delete end end content.file = nil end sleep 5 end redo unless content.file # skip unavailable enclosures redo if @opt.itemsize > itemsize itemsize = 0 end total+=inc end #shut down torrents if torrents.length > 0 $stderr.puts "Fetching torrents (duration: 30min to a couple of hours) " if @opt.verbose bt = torrents[0] completion = torrents.collect() do |e| e.percent_completed end while torrents.length > 0 sleep 30*60 for i in 0...torrents.length c = torrents[i].percent_completed complete = torrents[i].complete? 
          $stderr.puts "Fetched: #{c}% of #{torrentfiles[i].url} " if @opt.verbose
          # Shut a torrent down once it is complete or has made no progress
          # since the previous 30-minute poll.
          if complete or c == completion[i]
            begin
              torrents[i].shutdown
            rescue SystemExit
              exit 1
            rescue Interrupt, Exception
            end
            if complete
              playlist.add(torrentfiles[i])
              @history.add torrentfiles[i]
            else
              # Stalled: discard the partial download and its file.
              $stderr.puts "Aborted: #{torrentfiles[i].url}" if @opt.verbose
              begin
                torrentfiles[i].file.delete if torrentfiles[i].file.exist?
                torrentfiles[i] = nil
              rescue Interrupt, SystemExit
                exit 1
              rescue Exception
              end
            end
            torrents[i] = nil
            torrentfiles[i] = nil
            completion[i] = nil
            next
          end
          completion[i] = c
        end
        torrents.compact!
        torrentfiles.compact!
        completion.compact!
      end
      begin
        bt.shutdown_all
      rescue Interrupt, SystemExit
        exit 1
      rescue Exception
      end
      $stderr.puts "BitTorrent stopped" if @opt.verbose
    end
    playlist.finish
    # Trim oldest history entries, except in simulation or pure-cache runs.
    @history.trim(@opt.memsize) unless @opt.simulate or @opt.strategy == :cache
    playlist.to_s
  end

  private

  # Fetch the document at link.url, retrying up to @opt.retries times, and
  # parse it as XML.  Returns an OpenStruct with .dom (REXML Document) and
  # .url, or nil when the document could not be fetched or parsed.
  # Responses with audio/video content types are skipped (break leaves doc
  # empty for that attempt).
  def fetchdoc(link)
    doc = ""
    1.upto(@opt.retries) do |i|
      begin
        if link.url =~ %r{^http:} or link.url =~ %r{^ftp:}
          if link.referrer and (link.referrer =~ %r{^http:} or link.referrer =~ %r{^ftp:})
            open(link.url, "User-Agent" => USER_AGENT, "Referer" => link.referrer) do |f|
              # Do not slurp media files that were linked instead of a feed.
              break if f.content_type.index "audio/"
              break if f.content_type.index "video/"
              f.each_line() do |e|
                doc += e
              end
            end
          else
            open(link.url, "User-Agent" => USER_AGENT) do |f|
              break if f.content_type.index "audio/"
              break if f.content_type.index "video/"
              f.each_line() do |e|
                doc += e
              end
            end
          end
        else
          # Local files or other schemes: open without HTTP headers.
          open(link.url) do |f|
            f.each_line() do |e|
              doc += e
            end
          end
        end
        break
      rescue Interrupt
      rescue SystemExit
        exit 1
      rescue Exception
      end
      $stderr.puts "Attempt #{i} aborted" if @opt.verbose
      doc = ""
      sleep 5
    end
    res = OpenStruct.new
    begin
      res.dom = Document.new doc
    rescue Exception
    end
    if res.dom
      res.url = link.url
    else
      res = nil
    end
    res
  end

  # Choose a cache path (optional per-feed subdirectory + file name) for a
  # piece of content.
  # NOTE(review): assumes dir behaves like a Pathname (supports + and
  # exist?) -- confirm against callers.
  def filename(content, dir)
    #produce filename for content to be downloaded
    begin
      #per-feed subfolder
      if @opt.per_feed and content.feed_title and content.feed_title.size > 0
        newdir = dir+content.feed_title
        # Restricted filesystems (e.g. VFAT): replace non-portable characters,
        # collapse runs of dashes and whitespace.
        newdir = dir+content.feed_title.gsub(/[\\\/:*?\"<>|!]/, ' ').gsub(/-+/,'-').gsub(/\s+/,' ').strip if @opt.restricted_names
        if newdir.exist?
          if newdir.directory?
            dir = newdir
          end
        else
          newdir.mkdir
          dir = newdir
        end
      end
    rescue Exception
      # $stderr.puts "error: #{$!}"
    end
    # Candidate file extensions: taken from the torrent's internal name when
    # available, otherwise derived from the MIME type.
    ext = [""]
    if content.metainfo
      begin
        ext = ["."+content.metainfo.info.name.split(".").reverse[0]]
      rescue Exception
      end
    else
      ext = case content.mime.downcase
      when "audio/mpeg"
        [".mp3"]
      when "audio/x-mpeg"
        [".mp3"]
      when "audio/x-ms-wma"
        [".wma"]
      when "audio/x-m4a"
        [".m4a"]
      when "video/mpeg"
        [".mpg",".mpeg",".mpe",".mpa",".mp2",".mpv2"]
      when "video/quicktime"
        [".mov",".qt"]
      when "video/x-msvideo"
        [".avi"]
      when "video/x-ms-wmv"
        [".wmv"]
      when "video/x-ms-asf"
        [".asf"]
      when "video/mp4"
        [".m4v", ".mp4",".mpg4"]
      when "video/x-m4v"
        [".m4v", ".mp4",".mpg4"]
      else
        [""]
      end
    end
    #name from url?
    # Prefer a human-readable name taken from the URL (or from the torrent),
    # but only when its extension matches and it does not collide with an
    # existing file in dir.
    name = nil
    begin
      if content.metainfo
        name = content.metainfo.info.name
        name = nil if (dir+name).exist?
      else
        urlname = nil
        # Use the redirected URL's last path segment when a redirect occurred.
        urlname = URI.split(content.redirection_url)[5].split("/")[-1] if content.redirection_url
        urlname = URI.split(content.url)[5].split("/")[-1] unless urlname
        ext.each() do |e|
          if e.length == 0 or urlname[-e.length..-1].downcase == e
            name = urlname
            name = URI.unescape(name)
            name = nil if (dir+name).exist?
            break if name
          end
        end
      end
    rescue Exception
    end
    #unique name?
    # Fallback: timestamp-based name; loop (sleeping) until it is unique.
    loop do
      name = Time.now.to_f.to_s+ext[0]
      break unless (dir+name).exist?
      sleep 1
    end unless name
    dir+name
  end
end

# Builder for an OPML 1.1 subscription list: a REXML document with a head
# (optional title) and a body of outline elements, written to stdout.
class OPML
  def initialize(title = nil)
    @doc = Document.new
    @doc.xml_decl.dowrite
    @doc.add_element Element.new("opml")
    @doc.root.add_attribute "version", "1.1"
    head = Element.new("head")
    @doc.root.add_element head
    if title
      titlee = Element.new("title")
      titlee.text = title
      head.add_element titlee
    end
    @body = Element.new("body")
    @doc.root.add_element @body
    @size = 0
  end

  # Append one feed link to the body; text becomes the outline label when
  # given.
  def add(feedurl, text=nil)
    e = Element.new("outline")
    e.add_attribute("text", text) if text
    e.add_attribute "type", "link"
    e.add_attribute "url", feedurl
    @body.add_element e
    @size += 1
  end

  # Write the OPML document to standard output.
  def write()
    @doc.write $stdout, 0
  end

  # Number of outlines added so far.
  def size()
    @size
  end
end

# Feed search: crawls OPML/pcast/Atom/RSS documents looking for feeds whose
# metadata matches the query words, and emits the results as OPML.
class Query
  def initialize(opt, query)
    # NOTE(review): class variables assigned per-instance -- they act as
    # shared constants for all Query objects.
    @@ATOM_NS = Regexp.new '^http://purl.org/atom/ns#'
    @@ITUNES_NS = 'http://www.itunes.com/dtds/podcast-1.0.dtd'
    @opt = opt
    if query
      # The query is matched case-insensitively, word by word.
      @query = query.downcase.split
      @query = nil if @query.size == 0
    end
    @stats = Stats.new opt.dir
  end

  # Search starting from the given subscription-list URLs (or from a document
  # read on stdin when urls is empty).  Writes an OPML result document to
  # stdout and returns it.
  def search(urls)
    res = []
    begin
      newpaths = []
      dochistory = []
      paths = []
      if urls.size == 0
        $stderr.puts "Reading subscriptions from standard input" if @opt.verbose
        begin
          xml = ""
          $stdin.each() do |e|
            xml += e
          end
          path = OpenStruct.new
          path.doc = Document.new(xml)
          if path.doc and path.doc.root
            path.relevance = 0
            newpaths << path
          end
        rescue Interrupt, SystemExit
          raise
        rescue Exception
          $stderr.puts "Error: unreadable subscriptions"
        end
      else
        newpaths = urls.uniq.collect() do |e|
          path = OpenStruct.new
          path.url = e
          path
        end
        # Fetch each seed document up front; unreadable ones are dropped.
        newpaths = newpaths.collect() do |path|
          $stderr.puts "Fetching: #{path.url}" if @opt.verbose
          dochistory << path.url
          path.doc = fetchdoc(path)
          if path.doc
            path.relevance = 0
            path
          else
            $stderr.puts "Skipping unreadable document" if @opt.verbose
            nil
          end
        end
        newpaths.compact!
      end
      #send usage statistics
      @stats.ping_search @opt, @query.join(' ')
      #
      # Crawl loop: documents are visited best-relevance first, until enough
      # feeds were found (@opt.feeds) or the frontier is exhausted.
      loop do
        break if @opt.feeds and res.size >= @opt.feeds
        begin
          newpaths.sort!() do |path1, path2|
            path2.relevance <=> path1.relevance
          end
          paths = newpaths + paths
          newpaths = []
          path = nil
          # Pop the next readable, not-yet-visited document off the frontier.
          loop do
            path = paths.shift
            break unless path
            if path.doc
              break
            else
              if dochistory.detect{|e| e == path.url}
                $stderr.puts "Skipping duplicate: #{path.url}" if @opt.verbose
                next
              end
              $stderr.puts "Fetching: #{path.url}" if @opt.verbose
              dochistory << path.url
              path.doc = fetchdoc(path)
              if path.doc
                break
              end
              $stderr.puts "Error: skipping unreadable document"
            end
          end
          break unless path
          if path.doc.root.name == "opml"
            #doc relevance
            path.relevance += relevance_of(XPath.first(path.doc, "/opml/head/title/text()"))
            #outgoing links
            XPath.each(path.doc,"//outline") do |outline|
              url = outline.attributes["xmlUrl"]
              url = outline.attributes["url"] unless url
              next unless url
              begin
                # Resolve relative links against the current document's URL.
                url = URI.parse(path.url).merge(url).to_s if path.url
              rescue Interrupt, SystemExit
                raise
              rescue Exception
              end
              newpath = OpenStruct.new
              newpath.url = url
              newpath.referrer = path.url
              #link relevance
              # Inherit the document's relevance plus that of every enclosing
              # outline's label.
              newpath.relevance = path.relevance
              XPath.each(outline, "ancestor-or-self::outline") do |e|
                newpath.relevance += relevance_of(e.attributes["text"])
              end
              #
              newpaths << newpath
            end
          elsif path.doc.root.name == "pcast"
            #outgoing links
            XPath.each(path.doc,"/pcast/channel") do |channel|
              link = XPath.first(channel, "link[@rel='feed']")
              next unless link
              url = link.attributes["href"]
              next unless url
              begin
                url = URI.parse(path.url).merge(url).to_s if path.url
              rescue Interrupt, SystemExit
                raise
              rescue Exception
              end
              newpath = OpenStruct.new
              newpath.url = url
              newpath.referrer = path.url
              #link relevance
              newpath.relevance = path.relevance
              newpath.relevance += relevance_of(XPath.first(channel, "title/text()"))
              newpath.relevance += relevance_of(XPath.first(channel, "subtitle/text()"))
              #
              newpaths << newpath
            end
          elsif path.doc.root.namespace =~ @@ATOM_NS and path.url
            #doc relevance
title = nil begin XPath.each(path.doc.root,"/*/*") do |e| next unless e.namespace =~ @@ATOM_NS next unless e.name == "title" or e.name == "subtitle" title = e.text if e.name == "title" path.relevance += relevance_of(e.text) end rescue Interrupt, SystemExit raise rescue Exception #$stderr.puts "error: #{$!}" end if path.relevance > 0 $stderr.puts "Found: #{title} (relevance: #{path.relevance})" if @opt.verbose if title path.title = "" title.value.each() do |e3| #remove line breaks path.title+= e3.chomp+" " end path.title.strip! end res << path end elsif path.doc.root.name = "rss" and path.url #doc relevance title = XPath.first(path.doc, "//channel/title/text()") path.relevance += relevance_of(title) path.relevance += relevance_of(XPath.first(path.doc, "//channel/description/text()")) begin XPath.each(path.doc.root,"//channel/*") do |e| next unless e.name == "category" if e.namespace == @@ITUNES_NS XPath.each(e, "descendant-or-self::*") do |e2| next unless e2.name == "category" path.relevance += relevance_of(e2.attributes["text"]) end else path.relevance += relevance_of(e.text) end end rescue Interrupt, SystemExit raise rescue Exception #$stderr.puts "error: #{$!}" end if path.relevance > 0 $stderr.puts "Found: #{title} (relevance: #{path.relevance})" if @opt.verbose if title path.title = "" title.value.each() do |e3| #remove line breaks path.title+= e3.chomp+" " end path.title.strip! end res << path end end rescue Interrupt, SystemExit raise rescue Exception $stderr.puts "Error: skipping unreadable document" end end rescue Interrupt, SystemExit $stderr.puts "Execution interrupted" rescue Exception end result = nil while not result begin res.sort!() do |path1, path2| path2.relevance <=> path1.relevance end opml = OPML.new "Search results for \"#{@query.collect(){|e| "#{e} "}}\"" res.each() do |path| opml.add path.url, path.title end result = opml rescue Exception end end result.write result end private def relevance_of(meta) return 0 unless meta unless meta.kind_of? 
String #Text todo: resolve entities meta = meta.value end meta = meta.downcase meta = meta.split res = 0 @query.each() do |e| meta.each() do |e2| res += 1 if e2.index(e) end end res end def fetchdoc(link) doc = "" 1.upto(@opt.retries) do |i| begin if link.url =~ %r{^http:} or link.url =~ %r{^ftp:} if link.referrer and (link.referrer =~ %r{^http:} or link.referrer =~ %r{^ftp:}) open(link.url, "User-Agent" => USER_AGENT, "Referer" => link.referrer) do |f| break if f.content_type.index "audio/" break if f.content_type.index "video/" f.each_line() do |e| doc += e end end else open(link.url, "User-Agent" => USER_AGENT) do |f| break if f.content_type.index "audio/" break if f.content_type.index "video/" f.each_line() do |e| doc += e end end end else open(link.url) do |f| f.each_line() do |e| doc += e end end end break rescue Interrupt rescue SystemExit break rescue Exception end $stderr.puts "Attempt #{i} aborted" if @opt.verbose doc = "" sleep 5 end res = nil begin res = Document.new doc rescue Exception end res = nil unless res and res.root res end end opt.size *= 1_000_000 if opt.size opt.upload_rate *= 1024 if opt.upload_rate opt.itemsize *= 1_000_000 arguments = arguments + ARGV unless opt.check_for_update $stderr.puts "Disabling update check." if opt.verbose end unless opt.vote $stderr.puts "Disabling the sending of anonymous usage statistics." if opt.verbose end begin require "rubytorrent" opt.rubytorrent = true $stderr.puts "RubyTorrent detected." 
if opt.verbose rescue Interrupt, SystemExit exit 1 rescue Exception end if opt.function == :download cache = Cache.new opt cache.createplaylist arguments elsif opt.function == :search dir = Query.new opt, arguments.shift dir.search arguments end if opt.check_for_update update = Update.new opt.dir update.check end if opt.verbose $stderr.puts "" $stderr.puts " *********************************************************************" $stderr.puts " **** Qworum - A platform for web-based services (sponsor) ****" $stderr.puts " *********************************************************************" $stderr.puts " **** Sell and buy services: ****" $stderr.puts " **** Host services on your own domain; sell them to websites ****" $stderr.puts " **** or businesses on the service marketplace. ****" $stderr.puts " **** ****" $stderr.puts " **** Build enterprise information systems: ****" $stderr.puts " **** Use Qworum in your information system, and enjoy the ****" $stderr.puts " **** benefits of a powerful SOA technology. ****" $stderr.puts " **** ****" $stderr.puts " **** Learn more at http://www.qworum.com/ ****" $stderr.puts " *********************************************************************" $stderr.puts "" end $stderr.puts "End of podcatching session." if opt.verbose