clojurecup2014-luduverse/src/luduverse/ld-scraper.clj

(ns luduverse.ld-scraper
  (:require [net.cgrand.enlive-html :as html]
            [clojure.java.io :as io]
            [clojure.string :refer [split trim]]))

;; NOTE: This part is grabbed from the open source lib
;; https://github.com/arg-games/ldview

(defn url-action
  "Builds the ludumdare.com URL for `competition-id` with the given
  `action` query parameter, e.g. \"preview\"."
  [competition-id action]
  (let [competition-root (str "http://www.ludumdare.com/compo/ludum-dare-" competition-id)]
    (str competition-root "/?action=" action)))

(defn url-page
  "Builds the URL of one page of the entry listing for `competition-id`.
  `page` is multiplied by 24 to form the `start` offset."
  [competition-id page]
  (let [listing-url (url-action competition-id "preview")
        start-offset (* page 24)]
    (str listing-url "&start=" start-offset)))

(defn url-entry
  "Builds the URL of a single entry: the preview URL of `competition-id`
  with `entry-id` appended as the `uid` query parameter."
  [competition-id entry-id]
  (let [listing-url (url-action competition-id "preview")]
    (str listing-url "&uid=" entry-id)))

; Helpers to fetch an HTML resource or save a URL from the web into a
; local file. These two functions are needed to pull the content from
; the website.
(defn fetch-url
  "Wraps the string `url` in a java.net.URL and hands it to Enlive's
  `html-resource`, returning whatever that call produces."
  [url]
  (-> url
      java.net.URL.
      html/html-resource))


; The actual scraping process. We crawl through the entire content results
; and fetch the relevant information from the DOM.
(defn number-of-pages
  "Returns, as a long, the number found in the pagination links of the
  preview listing for `competition-id`.

  The pagination is taken from the second `p` directly under `div#compo2`;
  the text of its second-to-last `a` is parsed as the result.

  Throws NumberFormatException when that text is not a plain integer
  (including when the expected markup is missing)."
  [competition-id]
  (let [listing (fetch-url (url-action competition-id "preview"))
        pager (second (html/select listing [:div#compo2 :> :p]))
        anchors (html/select [pager] [:a])
        ;; NOTE(review): the last anchor is skipped, presumably because it is
        ;; a "next"-style link and the one before it carries the highest page
        ;; number -- confirm against the live markup.
        page-label (html/text (last (butlast anchors)))]
    ;; The label comes from a remote page, so parse it strictly as a number
    ;; instead of handing it to the Clojure reader: `read-string` is not safe
    ;; on untrusted input and would also accept non-numeric forms.
    (Long/parseLong (trim page-label))))

(defn entries-on-page
  "Returns a lazy sequence of entry ids found on listing page `page` of
  `competition-id`.

  For every node matching `.alt-1` or `.alt-2`, the `href` of its first
  `a` is split on \"=\" and the last piece is used as the id."
  [competition-id page]
  (let [listing (fetch-url (url-page competition-id page))
        cells (html/select listing #{[:.alt-1] [:.alt-2]})]
    (for [cell cells
          :let [href (-> (html/select [cell] [:a]) first :attrs :href)]]
      (last (split href #"=")))))

(defn theme
  "Returns the theme of competition `competition-id` as a trimmed string.

  The theme is scraped from the English Wikipedia article on Ludum Dare
  instead of ludumdare.com, because the LD site puts the theme in a
  different place for nearly every competition and maintaining a selector
  per competition is not worth it.

  Inside `table.wikitable` the selector picks the (competition-id + 3)-th
  child and, within it, the 3rd child; the first content item of the first
  match is returned.
  NOTE(review): the +3 offset presumably skips header rows -- confirm
  against the current article layout."
  [competition-id]
  (let [article (fetch-url "http://en.wikipedia.org/wiki/Ludum_Dare")
        theme-selector [:table.wikitable
                        (html/nth-child (+ 3 competition-id))
                        (html/nth-child 3)]
        theme-cell (first (html/select article theme-selector))]
    (trim (first (:content theme-cell)))))

(defn links-on-entry
  "Returns a lazy sequence of `{:title ... :url ...}` maps, one for each
  `a` that is a direct child of `p.links` in `content`. `:title` is the
  first content item of the anchor and `:url` is its `href` attribute."
  [content]
  (for [{body :content, attributes :attrs} (html/select content [:p.links :> :a])]
    {:title (first body)
     :url (:href attributes)}))

(defn images-on-entry
  "Returns a lazy sequence with the `href` of every anchor matched by the
  selector `[:table first-child :a]` inside `content`."
  [content]
  (let [anchors (html/select content [:table html/first-child :a])]
    (map (comp :href :attrs) anchors)))

(defn format-entry-type
  "Maps the entry type label scraped from the page to its short form:
  \"Jam Entry\" becomes \"jam\"; any other value (including nil) becomes
  \"compo\"."
  [unformatted-type]
  (case unformatted-type
    "Jam Entry" "jam"
    "compo"))

(defn- split-entry-heading
  "Splits an entry heading of the form \"Title - Author - Type\" into a
  vector `[title author type]`.

  When the heading has more than three \" - \"-separated segments, the last
  two are taken as author and type and the remaining ones are re-joined as
  the title, so a title that itself contains \" - \" stays intact.
  Headings with three or fewer segments are returned as split."
  [heading]
  (let [parts (split heading #" - ")
        n (count parts)]
    (if (> n 3)
      ;; NOTE(review): assumes the extra separators belong to the title, not
      ;; to the author name -- confirm against real entries.
      [(clojure.string/join " - " (subvec parts 0 (- n 2)))
       (nth parts (- n 2))
       (nth parts (dec n))]
      parts)))

(defn entry-details
  "Fetches the page of entry `entry-id` in competition `competition-id` and
  returns a map describing it:

    :ld_uid      the `entry-id` passed in
    :title       title part of the first `h3` inside `div#compo2`
    :author      author part of that heading
    :type        \"jam\" or \"compo\" (see `format-entry-type`)
    :description text of the third `p` inside `div#compo2`
    :links       result of `links-on-entry`
    :images      result of `images-on-entry`

  Throws if the page has no `h3` heading or fewer than three `p` elements
  inside `div#compo2`."
  [competition-id entry-id]
  (let [content (first (html/select (fetch-url (url-entry competition-id entry-id)) [:div#compo2]))
        heading (first (html/texts (html/select [content] [:h3])))
        ;; Positional destructuring of a plain split would shift author and
        ;; type whenever the title contains " - "; the helper guards that.
        [title author unformatted-type] (split-entry-heading heading)
        links (links-on-entry content)
        description (html/text (nth (html/select [content] [:p]) 2))
        images (images-on-entry content)]
    {:ld_uid entry-id
     :title title
     :description description
     :author author
     :type (format-entry-type unformatted-type)
     :links links
     :images images}))


;; Save everything we scraped.
;; NOTE(review): this is still a stub. The string below comes after the
;; parameter vector, so it is the function body (its return value), not a
;; docstring; nothing is persisted and both arguments are ignored.
(defn save-entry [competition-id entry]
  "Here we save the stuff ...")
Scrape the stuff from LD 2014-09-27 18:37:27 +02:00			`(ns luduverse.ld-scraper`
			`(:require [net.cgrand.enlive-html :as html]`
			`[clojure.java.io :as io]`
Working on the content grabbing mechanism :) 2014-09-28 00:09:23 +02:00			`[clojure.string :refer [split trim]]))`
Scrape the stuff from LD 2014-09-27 18:37:27 +02:00
			`;; NOTE: This part is grabbed from the open source lib`
			`;; https://github.com/arg-games/ldview`

			`(defn url-action [competition-id action]`
			`(str "http://www.ludumdare.com/compo/ludum-dare-" competition-id "/?action=" action))`

			`(defn url-page [competition-id page]`
			`(str (url-action competition-id "preview") "&start=" (* page 24)))`

			`(defn url-entry [competition-id entry-id]`
			`(str (url-action competition-id "preview") "&uid=" entry-id))`

			`; Helpers to fetch an html resource or save an url from the web into a`
			`; local file. This thow functions will be needed to pull the content from`
			`; the website.`
			`(defn fetch-url [url]`
			`(html/html-resource (java.net.URL. url)))`


			`; The actual scraping process. We crawl through the entire content results`
			`; and fetch the relevant information from the DOM.`
			`(defn number-of-pages [competition-id]`
			`(let [p (second (html/select (fetch-url (url-action competition-id "preview")) [:div#compo2 :> :p]))]`
			`(read-string (html/text (last (butlast (html/select [p] [:a])))))))`

			`(defn entries-on-page [competition-id page]`
			`(let [tds (html/select (fetch-url (url-page competition-id page)) #{[:.alt-1], [:.alt-2]})`
			`links (map #(:href %1) (map #(:attrs (first (html/select [%1] [:a]))) tds))]`
			`(map #(last (split %1 #"=")) links)))`

Working on the content grabbing mechanism :) 2014-09-28 00:09:23 +02:00			`(defn theme`
			`"We pull the themes for the competition from wikipedia because the LD page`
			`places the theme on random places and I don't want to build up 25 different`
			`selectors for 28 different places. This does simply not work and I give up`
			`and simply fetch it from Wikipedia. End of discussion :)"`
			`[competition-id]`
			`(-> (fetch-url "http://en.wikipedia.org/wiki/Ludum_Dare")`
			`(html/select [:table.wikitable (html/nth-child (+ 3 competition-id)) (html/nth-child 3)])`
			`first :content first`
			`trim))`
Scrape the stuff from LD 2014-09-27 18:37:27 +02:00
			`(defn links-on-entry [content]`
Working on the content grabbing mechanism :) 2014-09-28 00:09:23 +02:00			`(map (fn [x] {:title (first (:content x)) :url (:href (:attrs x))}) (html/select content [:p.links :> :a])))`
Scrape the stuff from LD 2014-09-27 18:37:27 +02:00
			`(defn images-on-entry [content]`
Working on the content grabbing mechanism :) 2014-09-28 00:09:23 +02:00			`(map #(:href (:attrs %1)) (html/select content [:table html/first-child :a])))`
Scrape the stuff from LD 2014-09-27 18:37:27 +02:00
			`(defn format-entry-type [unformatted-type]`
			`(if (= unformatted-type "Jam Entry") "jam" "compo"))`

			`(defn entry-details [competition-id entry-id]`
			`(let [content (first (html/select (fetch-url (url-entry competition-id entry-id)) [:div#compo2]))`
			`title-parts (first (html/texts (html/select [content] [:h3])))`
			`[title author unformatted-type] (split title-parts #" - ")`
			`links (links-on-entry content)`
			`description (html/text (nth (html/select [content] [:p]) 2))`
			`images (images-on-entry content)]`
			`{:ld_uid entry-id`
			`:title title`
			`:description description`
			`:author author`
			`:type (format-entry-type unformatted-type)`
			`:links links`
			`:images images}))`
Working on the content grabbing mechanism :) 2014-09-28 00:09:23 +02:00

			`;; Save all what we get`
			`(defn save-entry [competition-id entry]`
			`"Here we save the stuff ...")`