(ns ldview.tasks.scrape
  (:require [net.cgrand.enlive-html :as html]
            [clj-http.client :as http]
            [clojure.java.io :as io])
  (:use [clojure.string :only (split)]))

; Helper functions that construct the proper URLs. If the Ludum Dare website
; changes its URL scheme, this is the place to adjust.

; Number of the competition; it is spliced into every URL built below.
(def competition 27)

(defn url-action
  "Returns the competition URL with `action` as the value of the action
  query parameter."
  [action]
  (let [base (str "http://www.ludumdare.com/compo/ludum-dare-" competition)]
    (str base "/?action=" action)))

(defn url-page
  "Returns the preview URL with `page` appended as the start query
  parameter."
  [page]
  (-> "preview"
      url-action
      (str "&start=" page)))

(defn url-entry
  "Returns the preview URL with `entry-id` appended as the uid query
  parameter."
  [entry-id]
  (-> "preview"
      url-action
      (str "&uid=" entry-id)))

; Helpers to fetch an HTML resource, or to save a URL from the web into a
; local file. These two functions are needed to pull the content from the
; website.

(defn fetch-url
  "Loads the document at `url` (a string) and returns it as enlive nodes."
  [url]
  (-> url
      java.net.URL.
      html/html-resource))

(defn save-image-from-url
  "Requests `url` as a stream and copies the response body into
  `target-file`. The body stream is closed when the copy finishes."
  [url target-file]
  (let [destination (io/file target-file)]
    (with-open [in (-> url
                       (http/get {:as :stream})
                       :body)]
      (io/copy in destination))))

; The actual scraping process. We crawl through the entire result listing
; and fetch the relevant information from the DOM.
(defn number-of-pages
  "Fetches the preview listing and returns, as a Long, the number parsed
  from the text of the second-to-last anchor inside the second `p` child
  of div#compo2. Presumably that anchor is the link to the last page of
  the pager; verify against the live markup.

  Throws NumberFormatException when that text is not a plain integer."
  []
  (let [p (second (html/select (fetch-url (url-action "preview"))
                               [:div#compo2 :> :p]))
        page-label (html/text (last (butlast (html/select [p] [:a]))))]
    ;; The text comes from a remote page, so it must not go through the
    ;; Clojure reader (read-string can evaluate code via #=). Parse it
    ;; strictly as an integer instead.
    (Long/parseLong (.trim ^String page-label))))

(defn entries-on-page
  "Fetches listing page `page` and returns a lazy seq of strings: for
  every element with class alt-1 or alt-2, the part of its first anchor's
  href that follows the last `=`. Elements without an anchor or without an
  href are skipped."
  [page]
  (let [cells (html/select (fetch-url (url-page page)) #{[:.alt-1] [:.alt-2]})
        ;; keep drops nil hrefs, which would otherwise make split throw.
        hrefs (keep #(:href (:attrs (first (html/select [%] [:a])))) cells)]
    (map #(last (split % #"=")) hrefs)))

(defn links-on-entry
  "Returns one map per anchor that is a direct child of p.links inside
  `content`. :title is the first content item of the anchor and :url is
  its href attribute."
  [content]
  (map (fn [anchor]
         {:title (first (:content anchor))
          :url (:href (:attrs anchor))})
       (html/select [content] [:p.links :> :a])))

(defn images-on-entry
  "Returns the href attribute of every anchor matched by the selector
  [:table first-child :a] inside `content`."
  [content]
  (map #(:href (:attrs %))
       (html/select [content] [:table html/first-child :a])))

(defn entry-details
  "Fetches the page of entry `entry-id` and returns a map with the keys
  :ld_uid, :title, :description, :author, :links and :images.

  The text of the first h3 inside div#compo2 is split on \" - \"; the first
  piece becomes :title and the second :author. :description is the text of
  the third `p` inside div#compo2, so the call throws when fewer than
  three paragraphs are present."
  [entry-id]
  (let [content (first (html/select (fetch-url (url-entry entry-id))
                                    [:div#compo2]))
        title-parts (first (html/texts (html/select [content] [:h3])))
        ;; Only the first two pieces are used; any further pieces of the
        ;; heading are ignored.
        [title author] (split title-parts #" - ")
        links (links-on-entry content)
        description (html/text (nth (html/select [content] [:p]) 2))
        images (images-on-entry content)]
    {:ld_uid entry-id
     :title title
     :description description
     :author author
     :links links
     :images images}))