From 9809403e19cd147162eda8bc92f1e1a07e5372ed Mon Sep 17 00:00:00 2001
From: Aaron Mueller
Date: Sat, 27 Sep 2014 18:37:27 +0200
Subject: [PATCH] Scrape the stuff from LD

---
 src/luduverse/ld-scraper.clj | 67 ++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 src/luduverse/ld-scraper.clj

diff --git a/src/luduverse/ld-scraper.clj b/src/luduverse/ld-scraper.clj
new file mode 100644
index 0000000..534dcab
--- /dev/null
+++ b/src/luduverse/ld-scraper.clj
@@ -0,0 +1,67 @@
+(ns luduverse.ld-scraper
+  (:require [net.cgrand.enlive-html :as html]
+            [clj-http.client :as http]
+            [clojure.java.io :as io]
+            [clojure.string :refer [split]]))
+
+;; NOTE: This part is taken from the open source library
+;; https://github.com/arg-games/ldview
+
+(defn url-action
+  "Returns the ludumdare.com URL of competition `competition-id` with
+  `action` as the value of the `action` query parameter."
+  [competition-id action]
+  (-> "http://www.ludumdare.com/compo/ludum-dare-"
+      (str competition-id "/?action=" action)))
+
+(defn url-page
+  "Returns the preview URL of competition `competition-id` with the `start`
+  parameter set to `page` * 24 (presumably 24 entries are listed per page -
+  confirm against the site)."
+  [competition-id page]
+  (let [preview-url (url-action competition-id "preview")
+        offset      (* page 24)]
+    (str preview-url "&start=" offset)))
+
+(defn url-entry
+  "Returns the preview URL of competition `competition-id` with the `uid`
+  parameter set to `entry-id`."
+  [competition-id entry-id]
+  (let [preview-url (url-action competition-id "preview")]
+    (str preview-url "&uid=" entry-id)))
+
+; Helpers to fetch an HTML resource or to save a URL from the web into a
+; local file. These two functions are needed to pull the content from
+; the website.
+(defn fetch-url
+  "Loads the resource at `url` (a string) and returns the result of parsing
+  it with enlive's `html-resource`."
+  [url]
+  (-> url
+      java.net.URL.
+      html/html-resource))
+
+(defn save-image-from-url
+  "Requests `url` with the response body as a stream and copies that stream
+  into the file named by `target-file`. The stream is closed once the copy
+  has finished, also when copying fails."
+  [url target-file]
+  (with-open [in (-> url
+                     (http/get {:as :stream})
+                     :body)]
+    (io/copy in (io/file target-file))))
+
+
+; The actual scraping process. We crawl through the entire result listing
+; and fetch the relevant information from the DOM.
+(defn number-of-pages
+  "Returns the number of listing pages of competition `competition-id`.
+
+  Fetches the preview page, takes the second `p` directly under
+  `div#compo2` and parses the text of its second-to-last link as an
+  integer (presumably the last link is a 'next' style control - confirm
+  against the live markup). Throws NumberFormatException when that text is
+  not a plain integer."
+  [competition-id]
+  (let [page       (fetch-url (url-action competition-id "preview"))
+        pager      (second (html/select page [:div#compo2 :> :p]))
+        page-links (html/select [pager] [:a])
+        label      (html/text (last (butlast page-links)))]
+    ;; The label comes from a remote page, so it must not be handed to
+    ;; `read-string`, which can evaluate `#=` reader forms. Parse strictly.
+    (Long/parseLong (clojure.string/trim label))))
+
+(defn entries-on-page
+  "Returns the entry ids linked from listing page `page` of competition
+  `competition-id`.
+
+  Every element with class `alt-1` or `alt-2` is inspected; the id is the
+  text after the last `=` in the href of the element's first link.
+  Elements without a link, or whose link has no href, are skipped."
+  [competition-id page]
+  (let [cells (html/select (fetch-url (url-page competition-id page))
+                           #{[:.alt-1] [:.alt-2]})
+        ;; `keep` drops nil hrefs so `split` below never receives nil.
+        hrefs (keep #(:href (:attrs (first (html/select [%] [:a])))) cells)]
+    (map #(last (split % #"=")) hrefs)))
+
+(defn theme
+  "Returns the first content item of the first link matched by the selector
+  below on the preview page of competition `competition-id`, or nil when
+  nothing matches. Presumably that link holds the competition theme -
+  verify against the page markup."
+  [competition-id]
+  (let [anchors (html/select (fetch-url (url-action competition-id "preview"))
+                             [:div#content :> :div.post :> :div.entry :> :p html/first-child :a])]
+    (first (:content (first anchors)))))
+
+(defn links-on-entry
+  "Returns one `{:title ... :url ...}` map per `a` directly under a
+  `p.links` inside `content`. `:title` is the first content item of the
+  link and `:url` is its href."
+  [content]
+  (for [anchor (html/select [content] [:p.links :> :a])]
+    {:title (first (:content anchor))
+     :url   (:href (:attrs anchor))}))
+
+(defn images-on-entry
+  "Returns the hrefs of the links matched by `[:table html/first-child :a]`
+  inside `content`."
+  [content]
+  (map (comp :href :attrs)
+       (html/select [content] [:table html/first-child :a])))
+
+(defn format-entry-type
+  "Maps the entry type label taken from the page to a short name:
+  `Jam Entry` becomes `jam`; every other value, nil included, becomes
+  `compo`."
+  [unformatted-type]
+  (if (= unformatted-type "Jam Entry") "jam" "compo"))
+
+(defn entry-details
+  "Fetches the page of entry `entry-id` in competition `competition-id` and
+  returns a map with the keys `:ld_uid`, `:title`, `:description`,
+  `:author`, `:type`, `:links` and `:images`.
+
+  The first `h3` inside `div#compo2` is split on ` - `; its last two parts
+  are taken as author and entry type and everything before them as the
+  title, so a title that itself contains ` - ` stays intact. The
+  description is the text of the third `p` inside `div#compo2`, or an empty
+  string when the page has fewer than three."
+  [competition-id entry-id]
+  (let [content     (first (html/select (fetch-url (url-entry competition-id entry-id))
+                                        [:div#compo2]))
+        title-parts (first (html/texts (html/select [content] [:h3])))
+        parts       (split title-parts #" - ")
+        ;; Author and type are the trailing two parts. With three parts or
+        ;; fewer this is the same as taking first/second/third directly.
+        lead-count  (max 1 (- (count parts) 2))
+        title       (clojure.string/join " - " (take lead-count parts))
+        [author unformatted-type] (drop lead-count parts)
+        links       (links-on-entry content)
+        ;; Default to nil instead of throwing when the paragraph is absent;
+        ;; enlive's `text` turns a non-node into "".
+        description (html/text (nth (html/select [content] [:p]) 2 nil))
+        images      (images-on-entry content)]
+    {:ld_uid      entry-id
+     :title       title
+     :description description
+     :author      author
+     :type        (format-entry-type unformatted-type)
+     :links       links
+     :images      images}))