Write a runner for the web scraping stuff
This commit is contained in:
parent
d2d60cf48d
commit
452e3d3245
3 changed files with 43 additions and 12 deletions
37
src/ldview/tasks/runner.clj
Normal file
37
src/ldview/tasks/runner.clj
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
(ns ldview.tasks.runner
|
||||||
|
(:require [ldview.tasks.scrape :as scrape]
|
||||||
|
[ldview.tasks.images :as images]
|
||||||
|
[me.raynes.fs :as fs])
|
||||||
|
(:use [ldview.util]))
|
||||||
|
|
||||||
|
(defn cleanup
  "Resets the image output tree: deletes *image-base-path* if it exists,
  then recreates the thumbs/, fullscreen/ and raw/ sub-directories.
  Returns whatever the last fs/mkdirs call returns."
  []
  (let [make-subdir (fn [subdir]
                      (fs/mkdirs (str *image-base-path* subdir)))]
    ; Start from a clean slate so files from earlier runs do not linger.
    (when (fs/exists? *image-base-path*)
      (fs/delete-dir *image-base-path*))
    (make-subdir "/thumbs/")
    (make-subdir "/fullscreen/")
    (make-subdir "/raw/")))
|
||||||
|
|
||||||
|
(defn save-entry
  "Downloads every image listed under (:images entry) and derives a
  fullscreen version and a thumbnail from each downloaded file.

  The digits following `shot` in each image URL are used to name the
  raw, fullscreen and thumbnail files via images/image-name.

  Returns nil when the entry has no :images; otherwise a fully realized
  seq holding, per image URL, the result of images/sourceimage->thumb."
  [entry]
  ; TODO: Save to Database
  (when (:images entry)
    ; `map` is lazy: without `doall` the downloads and conversions below
    ; would only run if some caller happened to consume the result.
    (doall
      (map (fn [image-url]
             ; NOTE(review): `number` is nil when the URL contains no
             ; `shot<digits>` part -- confirm how images/image-name
             ; handles that case.
             (let [number (second (re-find #"shot([0-9]+)" image-url))
                   raw-image-path (images/image-name "raw" number)]
               (scrape/save-image-from-url image-url raw-image-path)
               (images/sourceimage->fullscreen raw-image-path (images/image-name "fullscreen" number))
               (images/sourceimage->thumb raw-image-path (images/image-name "thumbs" number))))
           (:images entry)))))
|
||||||
|
|
||||||
|
; TODO: Make it multithreaded
|
||||||
|
(defn fetch-all-content
  "Fetches every result page and passes each entry found on it to
  save-entry.

  Returns a fully realized seq with one element per page, each being the
  realized seq of save-entry results for that page."
  []
  (let [pages 1] ;(scrape/number-of-pages)]
    ; Both maps are forced with `doall`; left lazy, no page would be
    ; fetched and no entry saved unless a caller consumed the result.
    (doall
      (map (fn [page]
             (doall (map save-entry (scrape/fetch-page page))))
           ; Page numbers are assumed to be 1-based (the previous code
           ; always requested page 1) -- TODO confirm against
           ; scrape/fetch-page once number-of-pages is enabled.
           (range 1 (inc pages))))))
|
||||||
|
|
||||||
|
|
||||||
|
(defn start
  "Entry point of the runner: calls cleanup to reset the image
  directories, then calls fetch-all-content and returns its result."
  []
  (cleanup)
  (fetch-all-content))
|
||||||
|
|
|
@ -22,9 +22,9 @@
|
||||||
(defn fetch-url [url]
|
(defn fetch-url [url]
|
||||||
(html/html-resource (java.net.URL. url)))
|
(html/html-resource (java.net.URL. url)))
|
||||||
|
|
||||||
(defn save-image-from-url [url target-path new-name]
|
(defn save-image-from-url [url target-file]
|
||||||
(with-open [bodystream (:body (http/get url {:as :stream}))]
|
(with-open [bodystream (:body (http/get url {:as :stream}))]
|
||||||
(io/copy bodystream (io/file (str target-path new-name)))))
|
(io/copy bodystream (io/file target-file))))
|
||||||
|
|
||||||
|
|
||||||
; The actual scraping process. We crawl through the entire content results
|
; The actual scraping process. We crawl through the entire content results
|
||||||
|
@ -51,16 +51,12 @@
|
||||||
links (links-on-entry content)
|
links (links-on-entry content)
|
||||||
description (html/text (nth (html/select [content] [:p]) 2))
|
description (html/text (nth (html/select [content] [:p]) 2))
|
||||||
images (images-on-entry content)]
|
images (images-on-entry content)]
|
||||||
{:title title
|
{:id entry-id
|
||||||
|
:title title
|
||||||
:author author
|
:author author
|
||||||
:description description
|
:description description
|
||||||
:links links
|
:links links
|
||||||
:images images}))
|
:images images}))
|
||||||
|
|
||||||
; Do the job
|
|
||||||
; TODO: Make it multithreaded and resumable.
|
|
||||||
(defn fetch-page [page]
|
(defn fetch-page [page]
|
||||||
(map entry-details (entries-on-page page)))
|
(map entry-details (entries-on-page page)))
|
||||||
|
|
||||||
(defn fetch-all []
|
|
||||||
(map fetch-page (number-of-pages)))
|
|
||||||
|
|
|
@ -1,9 +1,7 @@
|
||||||
(ns ldview.util
|
(ns ldview.util)
|
||||||
(:require [noir.io :as io]
|
|
||||||
[markdown.core :as md]))
|
|
||||||
|
|
||||||
; Global stuff that does not change over time and needed everywhere
|
; Global stuff that does not change over time and needed everywhere
|
||||||
(def ^:dynamic *base-url* "http://www.ludumdare.com/compo/")
|
(def ^:dynamic *base-url* "http://www.ludumdare.com/compo/")
|
||||||
(def ^:dynamic *image-base-path* "content/images")
|
(def ^:dynamic *image-base-path* "resources/public/img/ld")
|
||||||
|
|
||||||
(def ^:dynamic *competition* 27)
|
(def ^:dynamic *competition* 27)
|
||||||
|
|
Loading…
Reference in a new issue