Write a runner for the web scraping stuff
This commit is contained in:
parent
d2d60cf48d
commit
452e3d3245
3 changed files with 43 additions and 12 deletions
37
src/ldview/tasks/runner.clj
Normal file
37
src/ldview/tasks/runner.clj
Normal file
|
@ -0,0 +1,37 @@
|
|||
(ns ldview.tasks.runner
|
||||
(:require [ldview.tasks.scrape :as scrape]
|
||||
[ldview.tasks.images :as images]
|
||||
[me.raynes.fs :as fs])
|
||||
(:use [ldview.util]))
|
||||
|
||||
(defn cleanup
  "Wipes the directory at *image-base-path* (when it exists) and recreates
  the thumbs, fullscreen and raw sub-directories beneath it.
  Returns the result of the final fs/mkdirs call."
  []
  (when (fs/exists? *image-base-path*)
    (fs/delete-dir *image-base-path*))
  (let [under-base (fn [suffix] (str *image-base-path* suffix))]
    (fs/mkdirs (under-base "/thumbs/"))
    (fs/mkdirs (under-base "/fullscreen/"))
    (fs/mkdirs (under-base "/raw/"))))
|
||||
|
||||
(defn save-entry
  "Downloads every image listed under (:images entry) and derives the
  fullscreen and thumbnail variants from each raw download.

  The digits following \"shot\" in each image URL are passed to
  images/image-name to build the raw, fullscreen and thumbs target paths.
  Returns a fully realized sequence with one result per image URL (the value
  of the thumbnail conversion), or nil when the entry has no :images."
  [entry]
  ; TODO: Save to Database
  (when-let [image-urls (:images entry)]
    ; map is lazy: without doall the downloads and conversions below would
    ; only run if a caller happened to consume the returned sequence.
    (doall
      (map (fn [image-url]
             (let [number         (second (re-find #"shot([0-9]+)" image-url))
                   raw-image-path (images/image-name "raw" number)]
               (scrape/save-image-from-url image-url raw-image-path)
               (images/sourceimage->fullscreen raw-image-path
                                               (images/image-name "fullscreen" number))
               (images/sourceimage->thumb raw-image-path
                                          (images/image-name "thumbs" number))))
           image-urls))))
|
||||
|
||||
; TODO: Make it multithreaded
|
||||
(defn fetch-all-content
  "Fetches each result page via scrape/fetch-page and runs save-entry on every
  entry found. Returns a fully realized sequence holding, per page, the
  sequence of save-entry results."
  []
  (let [pages 1] ;(scrape/number-of-pages)]
    ; doall at every level: these maps exist for their side effects, and a
    ; lazy sequence would silently skip the work unless someone consumed it.
    (doall
      (map (fn [page]
             ; Pass the actual page number instead of a hard-coded 1.
             (doall
               (map (fn [entry] (doall (save-entry entry)))
                    (scrape/fetch-page page))))
           ; NOTE(review): assumes pages are numbered from 1, matching the
           ; previous hard-coded (scrape/fetch-page 1) -- confirm against
           ; scrape/entries-on-page.
           (range 1 (inc pages))))))
|
||||
|
||||
|
||||
(defn start
  "Entry point of the runner: resets the image directories with cleanup,
  then returns whatever fetch-all-content produces."
  []
  (cleanup)
  (fetch-all-content))
|
||||
|
|
@ -22,9 +22,9 @@
|
|||
(defn fetch-url
  "Wraps the url string in a java.net.URL and hands it to html/html-resource,
  returning that call's result."
  [url]
  (-> url
      java.net.URL.
      html/html-resource))
|
||||
|
||||
(defn save-image-from-url [url target-path new-name]
|
||||
(defn save-image-from-url [url target-file]
|
||||
(with-open [bodystream (:body (http/get url {:as :stream}))]
|
||||
(io/copy bodystream (io/file (str target-path new-name)))))
|
||||
(io/copy bodystream (io/file target-file))))
|
||||
|
||||
|
||||
; The actual scraping process. We crawl through the entire content results
|
||||
|
@ -51,16 +51,12 @@
|
|||
links (links-on-entry content)
|
||||
description (html/text (nth (html/select [content] [:p]) 2))
|
||||
images (images-on-entry content)]
|
||||
{:title title
|
||||
{:id entry-id
|
||||
:title title
|
||||
:author author
|
||||
:description description
|
||||
:links links
|
||||
:images images}))
|
||||
|
||||
; Do the job
|
||||
; TODO: Make it multithreaded and resumable.
|
||||
(defn fetch-page
  "Returns a lazy sequence with entry-details applied to each item that
  entries-on-page yields for the given page."
  [page]
  (->> page
       entries-on-page
       (map entry-details)))
|
||||
|
||||
(defn fetch-all []
|
||||
(map fetch-page (number-of-pages)))
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
(ns ldview.util
|
||||
(:require [noir.io :as io]
|
||||
[markdown.core :as md]))
|
||||
(ns ldview.util)
|
||||
|
||||
; Global stuff that does not change over time and needed everywhere
|
||||
(def ^:dynamic *base-url* "http://www.ludumdare.com/compo/")
|
||||
(def ^:dynamic *image-base-path* "content/images")
|
||||
(def ^:dynamic *image-base-path* "resources/public/img/ld")
|
||||
|
||||
(def ^:dynamic *competition* 27)
|
||||
|
|
Loading…
Reference in a new issue