(ns luduverse.scraper
  (:require [net.cgrand.enlive-html :as html]
            [clojure.java.io :as io]
            [clojure.string :refer [split trim]]
            [luduverse.images :as image]
            [luduverse.models.entry :as entry]
            [luduverse.models.competition :as competition]))

;; NOTE: This part is grabbed from the open source lib
;; https://github.com/arg-games/ldview

(defn url-action
  "Builds the URL of a Ludum Dare competition page for the given action."
  [competition-id action]
  (str "http://www.ludumdare.com/compo/ludum-dare-" competition-id
       "/?action=" action))

(defn url-page
  "Builds the URL of one page of the entry preview listing. The `start`
  offset is `page` multiplied by 24."
  [competition-id page]
  (str (url-action competition-id "preview") "&start=" (* page 24)))

(defn url-entry
  "Builds the URL of a single entry's preview page."
  [competition-id entry-id]
  (str (url-action competition-id "preview") "&uid=" entry-id))

; Helpers to fetch an html resource or save a URL from the web into a
; local file. These functions are needed to pull the content from
; the website.

(defn fetch-url
  "Fetches `url` and returns it parsed by enlive's `html-resource`."
  [url]
  (html/html-resource (java.net.URL. url)))

; The actual scraping process. We crawl through the entire content results
; and fetch the relevant information from the DOM.

(defn number-of-pages
  "Returns the number found in the second-to-last link of the second <p>
  directly under div#compo2 on the competition's preview page.

  The link text comes from a remote page, so it is parsed with
  `Long/parseLong` instead of `read-string`: the Clojure reader must not be
  used on untrusted input. Throws NumberFormatException when the text is not
  a plain integer."
  [competition-id]
  (let [pager (second (html/select (fetch-url (url-action competition-id "preview"))
                                   [:div#compo2 :> :p]))
        page-link (last (butlast (html/select [pager] [:a])))]
    (Long/parseLong (trim (html/text page-link)))))

(defn entries-on-page
  "Returns the entry ids linked from the `.alt-1` / `.alt-2` elements of the
  given preview page. The id is whatever follows the last `=` in the href of
  the first link of each element. Elements without a link (or without an
  href) are skipped instead of raising a NullPointerException."
  [competition-id page]
  (let [cells (html/select (fetch-url (url-page competition-id page))
                           #{[:.alt-1] [:.alt-2]})]
    (keep (fn [cell]
            (when-let [href (-> (html/select [cell] [:a]) first :attrs :href)]
              (last (split href #"="))))
          cells)))

(defn theme
  "Returns the theme of the competition, pulled from Wikipedia.

  The LD page places the theme in a different spot for nearly every
  competition, and maintaining a separate selector for each of them does not
  work, so the theme is simply read from the Ludum Dare table on Wikipedia."
  [competition-id]
  (-> (fetch-url "http://en.wikipedia.org/wiki/Ludum_Dare")
      (html/select [:table.wikitable
                    (html/nth-child (+ 3 competition-id))
                    (html/nth-child 3)])
      first
      :content
      first
      trim))

(defn links-on-entry
  "Returns a seq of {:title ... :url ...} maps, one per <a> directly under
  `p.links` in `content`. The title is the first content node of the link."
  [content]
  (map (fn [link]
         {:title (first (:content link))
          :url   (:href (:attrs link))})
       (html/select content [:p.links :> :a])))

(defn images-on-entry
  "Returns the hrefs of the links selected by [:table first-child :a] in
  `content`."
  [content]
  (map (comp :href :attrs)
       (html/select content [:table html/first-child :a])))

(defn format-entry-type
  "Maps the heading's type label to \"jam\" for \"Jam Entry\" and to
  \"compo\" for anything else."
  [unformatted-type]
  (if (= unformatted-type "Jam Entry")
    "jam"
    "compo"))

(defn- parse-entry-heading
  "Splits an entry heading on ' - ' into [title username type].

  When the heading has more than three segments, the last two are taken as
  username and type and the remaining leading segments are re-joined as the
  title, so a title that itself contains ' - ' is kept intact. Headings with
  three or fewer segments are returned as split."
  [heading]
  (let [parts (split heading #" - ")]
    (if (> (count parts) 3)
      (let [[username entry-type] (take-last 2 parts)]
        [(clojure.string/join " - " (drop-last 2 parts)) username entry-type])
      parts)))

(defn entry-details
  "Scrapes a single entry page and returns a map with the keys :ld_uid,
  :title, :description, :username, :type, :links and :images.

  The title, username and type come from the first <h3> inside div#compo2;
  the description is the text of the third <p> inside it."
  [competition-id entry-id]
  (let [content (first (html/select (fetch-url (url-entry competition-id entry-id))
                                    [:div#compo2]))
        heading (first (html/texts (html/select [content] [:h3])))
        [title username unformatted-type] (parse-entry-heading heading)
        description (html/text (nth (html/select [content] [:p]) 2))]
    {:ld_uid      entry-id
     :title       title
     :description description
     :username    username
     :type        (format-entry-type unformatted-type)
     :links       (links-on-entry content)
     :images      (images-on-entry content)}))

;; Save what we got

(defn- ensure-competition!
  "Creates the competition record (with its theme) unless it already exists."
  [competition-id]
  (when-not (competition/exists? competition-id)
    (competition/create! competition-id (theme competition-id))))

(defn save-entry
  "Scrapes one entry, creates the competition and the entry when they do not
  exist yet, and then saves the entry's images."
  [competition-id entry-id]
  (let [entry (entry-details competition-id entry-id)]
    (ensure-competition! competition-id)
    (when-not (entry/exists? (:ld_uid entry))
      (entry/create! entry))
    (image/save-images-for-entry competition-id entry)))

(defn save-page
  "Saves every entry listed on the given preview page, in parallel via
  `pmap`. Returns true."
  [competition-id page]
  (let [entry-ids (entries-on-page competition-id page)]
    ;; Make sure the competition exists once, up front: otherwise the
    ;; check-then-create inside `save-entry` runs concurrently on several
    ;; pmap threads and can create the competition more than once.
    (when (seq entry-ids)
      (ensure-competition! competition-id))
    (doall (pmap #(save-entry competition-id %) entry-ids))
    true))

(defn save-competition
  "Saves all pages 0 .. (number-of-pages - 1) of the competition, one page
  after the other. Returns true."
  [competition-id]
  (doseq [page (range (number-of-pages competition-id))]
    (save-page competition-id page))
  true)