Scrape the stuff from LD
This commit is contained in:
parent
7baeda8ca0
commit
9809403e19
1 changed files with 67 additions and 0 deletions
67
src/luduverse/ld-scraper.clj
Normal file
67
src/luduverse/ld-scraper.clj
Normal file
|
@ -0,0 +1,67 @@
|
|||
(ns luduverse.ld-scraper
|
||||
(:require [net.cgrand.enlive-html :as html]
|
||||
[clj-http.client :as http]
|
||||
[clojure.java.io :as io]
|
||||
[clojure.string :refer [split]]))
|
||||
|
||||
;; NOTE: This part is grabbed from the open source lib
|
||||
;; https://github.com/arg-games/ldview
|
||||
|
||||
(defn url-action
  "Builds the ludumdare.com URL for `competition-id` with the given
  `action` query parameter. Both arguments are stringified with `str`."
  [competition-id action]
  (let [competition-root (str "http://www.ludumdare.com/compo/ludum-dare-" competition-id)]
    (str competition-root "/?action=" action)))
|
||||
|
||||
(defn url-page
  "Builds the preview-listing URL for `competition-id` at the given
  `page`. The `start` query parameter is `page` multiplied by 24."
  [competition-id page]
  (let [preview-url (url-action competition-id "preview")
        start       (* page 24)]
    (str preview-url "&start=" start)))
|
||||
|
||||
(defn url-entry
  "Builds the preview URL of a single entry by appending `entry-id` as
  the `uid` query parameter to the competition's preview URL."
  [competition-id entry-id]
  (let [preview-url (url-action competition-id "preview")]
    (str preview-url "&uid=" entry-id)))
|
||||
|
||||
; Helpers to fetch an HTML resource or save a URL from the web into a
|
||||
; local file. These two functions will be needed to pull the content from
|
||||
; the website.
|
||||
(defn fetch-url
  "Wraps the string `url` in a java.net.URL and hands it to enlive's
  `html-resource`, returning whatever that call produces."
  [url]
  (-> url
      (java.net.URL.)
      (html/html-resource)))
|
||||
|
||||
(defn save-image-from-url
  "Requests `url` with clj-http asking for the body as a stream, and
  copies that stream into the file named by `target-file`. The body
  stream is closed by `with-open` once the copy finishes or fails."
  [url target-file]
  (let [response    (http/get url {:as :stream})
        destination (io/file target-file)]
    (with-open [image-stream (:body response)]
      (io/copy image-stream destination))))
|
||||
|
||||
|
||||
; The actual scraping process. We crawl through the entire content results
|
||||
; and fetch the relevant information from the DOM.
|
||||
(defn number-of-pages
  "Reads a page number from the preview listing of `competition-id`.

  Fetches the preview page, takes the second `p` that is a direct child
  of `div#compo2`, and parses the text of the second-to-last `a` inside
  it as an integer.

  Returns a long. Throws NumberFormatException when that link text is
  not a plain integer (including when the link is missing)."
  [competition-id]
  (let [preview    (fetch-url (url-action competition-id "preview"))
        p          (second (html/select preview [:div#compo2 :> :p]))
        page-links (html/select [p] [:a])
        label      (html/text (last (butlast page-links)))]
    ;; The text comes from a remote page, so parse it strictly as a number
    ;; instead of handing it to the Clojure reader: `read-string` can
    ;; evaluate code embedded in its input when *read-eval* is enabled.
    (Long/parseLong (clojure.string/trim label))))
|
||||
|
||||
(defn entries-on-page
  "Returns a lazy seq of entry ids found on listing page `page` of
  `competition-id`.

  Selects every node with class `alt-1` or `alt-2`, takes the `href` of
  the first `a` inside each one, and keeps the part of that href after
  its last `=`."
  [competition-id page]
  (let [listing    (fetch-url (url-page competition-id page))
        cells      (html/select listing #{[:.alt-1], [:.alt-2]})
        first-href (fn [cell]
                     (-> (html/select [cell] [:a]) first :attrs :href))
        id-of      (fn [href]
                     (last (split href #"=")))]
    (map (comp id-of first-href) cells)))
|
||||
|
||||
(defn theme
  "Fetches the preview page of `competition-id` and returns the first
  content item of the first node matched by the selector below (nil
  when nothing matches).
  NOTE(review): presumably this is the competition's theme text, going
  by the function name -- confirm against the page markup."
  [competition-id]
  (let [preview  (fetch-url (url-action competition-id "preview"))
        selector [:div#content :> :div.post :> :div.entry :> :p html/first-child :a]]
    (-> (html/select preview selector)
        first
        :content
        first)))
|
||||
|
||||
(defn links-on-entry
  "Returns a lazy seq of `{:title ... :url ...}` maps, one per `a` that
  is a direct child of a `p.links` inside `content`. `:title` is the
  anchor's first content item and `:url` is its `href` attribute."
  [content]
  (for [anchor (html/select [content] [:p.links :> :a])]
    {:title (-> anchor :content first)
     :url   (-> anchor :attrs :href)}))
|
||||
|
||||
(defn images-on-entry
  "Returns a lazy seq with the `href` attribute of every node in
  `content` matched by the selector `[:table html/first-child :a]`."
  [content]
  (->> (html/select [content] [:table html/first-child :a])
       (map (comp :href :attrs))))
|
||||
|
||||
(defn format-entry-type
  "Maps the raw entry-type label to a short tag: \"jam\" when the label
  is exactly \"Jam Entry\", \"compo\" for anything else (including nil)."
  [unformatted-type]
  (case unformatted-type
    "Jam Entry" "jam"
    "compo"))
|
||||
|
||||
(defn- split-entry-heading
  "Splits an entry heading on \" - \" into [title author type].

  With three or fewer parts the split result is returned unchanged. With
  more than three parts the last two are taken as author and type, and
  everything before them is re-joined with \" - \" as the title, so a
  title that itself contains \" - \" does not shift the other fields.
  NOTE(review): assumes the heading always ends with author and type --
  confirm against the page markup."
  [heading]
  (let [parts (split heading #" - ")
        n     (count parts)]
    (if (> n 3)
      [(clojure.string/join " - " (subvec parts 0 (- n 2)))
       (nth parts (- n 2))
       (nth parts (dec n))]
      parts)))

(defn entry-details
  "Fetches the page of entry `entry-id` in `competition-id` and returns
  a map describing it:

    :ld_uid      the `entry-id` passed in
    :title       title part of the first `h3` inside `div#compo2`
    :author      author part of that heading
    :type        \"jam\" or \"compo\" (see `format-entry-type`)
    :description text of the third `p` inside `div#compo2`
    :links       result of `links-on-entry`
    :images      result of `images-on-entry`

  Throws when the page has no `h3` or fewer than three `p` elements
  inside `div#compo2`."
  [competition-id entry-id]
  (let [content (first (html/select (fetch-url (url-entry competition-id entry-id)) [:div#compo2]))
        title-parts (first (html/texts (html/select [content] [:h3])))
        ;; Author and type are taken from the end of the heading so that
        ;; a title containing " - " stays in one piece.
        [title author unformatted-type] (split-entry-heading title-parts)
        links (links-on-entry content)
        description (html/text (nth (html/select [content] [:p]) 2))
        images (images-on-entry content)]
    {:ld_uid entry-id
     :title title
     :description description
     :author author
     :type (format-entry-type unformatted-type)
     :links links
     :images images}))
|
Loading…
Reference in a new issue