· clojure

Clojure: Reading and writing a reasonably sized file


(defn get-pixels
  "Parses every string in `pix` with Integer/parseInt and returns the
  resulting lazy sequence of ints."
  [pix]
  (for [raw-value pix]
    (Integer/parseInt raw-value)))

(defn create-tuple
  "Turns one split CSV row into a map: the first field is kept as-is under
  :label, the remaining fields are parsed by `get-pixels` and stored
  under :pixels."
  [[label & pixel-fields]]
  {:pixels (get-pixels pixel-fields)
   :label  label})

(defn tuples
  "Lazily applies `create-tuple` to each element of `rows`."
  [rows]
  (for [fields rows]
    (create-tuple fields)))

(defn parse-row
  "Splits every string in `row` on commas. Despite the singular name, `row`
  is a collection of lines; the result is a lazy sequence with one vector
  of fields per line."
  [row]
  (for [line row]
    (clojure.string/split line #",")))

(defn read-raw
  "Opens the file at `path`, skips its first line, and returns up to `n` of
  the following lines as a vector. The lines are fully realized before the
  reader is closed, so the result is safe to use after this call returns."
  [path n]
  (with-open [reader (clojure.java.io/reader path)]
    (->> (line-seq reader)
         (drop 1)          ; discard the first line
         (take n)
         (into []))))      ; force realization while the reader is still open

(defn read-train-set-raw
  "Reads up to `n` lines (after the first line) from data/train.csv via
  `read-raw`."
  [n]
  (read-raw "data/train.csv" n))

;; Rows of the training file as {:pixels ... :label ...} maps.
;; NOTE: the file read happens when this def is evaluated (i.e. at load time);
;; splitting and parsing of the lines is lazy.
(def parsed-rows
  (->> (read-train-set-raw 42000)
       parse-row
       tuples))

;; Zero-based pixel positions that are to be discarded from every row.
;; `dead-to-us?` looks up the index half of an [index value] pair in this
;; vector, and `remove-unwanted-pixels` drops the pixels that match.
;; NOTE(review): presumably these are positions that carry no useful signal
;; in the training data -- confirm how the list was derived.
(def dead-to-us-pixels
  [0 1 2 3 4 5 6 7 8 9 10 11 16 17 18 19 20  21  22  23  24  25  26  27 28 29  30 31 52 53 54 55 56 57 82 83 84 85 111 112 139 140 141 168 196 392 420 421 448 476 532 560 644 645 671 672 673 699 700 701 727 728 729 730 731 754 755 756 757 758 759 760 780 781 782 783])

(defn in?
  "Linear membership test: returns true when some element of `seq` is equal
  (by =) to `elm`, otherwise nil."
  [seq elm]
  (some (partial = elm) seq))

(defn dead-to-us?
  "Given an indexed pixel (a sequence whose first element is the pixel's
  position), reports whether that position is listed in
  `dead-to-us-pixels`. Returns true when listed, otherwise nil."
  [pixel-with-index]
  (let [position (first pixel-with-index)]
    (in? dead-to-us-pixels position)))

(defn remove-unwanted-pixels
  "Returns a copy of `row` (a map with :pixels and :label) whose :pixels
  sequence omits every pixel whose position satisfies `dead-to-us?`.
  The :label is carried over unchanged; the new :pixels value is lazy."
  [row]
  (let [indexed-pixels (map-indexed vector (:pixels row))
        kept-pixels    (for [pair indexed-pixels
                             :when (not (dead-to-us? pair))]
                         (second pair))]
    {:pixels kept-pixels
     :label  (:label row)}))

;; First attempt: writes every pre-loaded row from `parsed-rows`, with the
;; unwanted pixels removed, to /tmp/attempt-1.txt (one row per line).
;; NOTE(review): `to-file-format` is not defined in the visible part of this
;; file, and a later `-main` definition in this file replaces this one.
(defn -main []
  (with-open [wrt (clojure.java.io/writer "/tmp/attempt-1.txt")]
    (doseq [row parsed-rows
            :let [formatted (-> row remove-unwanted-pixels to-file-format)]]
      (.write wrt (str formatted "\n")))))

(defn split-on-comma
  "Splits `line` on commas and returns a vector of the field strings.
  Uses the fully qualified clojure.string/split (as `parse-row` does) so the
  function does not depend on a `string` namespace alias being set up."
  [line]
  (clojure.string/split line #","))

(defn clean-train-file
  "Streams data/train.csv line by line (skipping the first line), converts
  each line to a tuple, removes the unwanted pixels, formats the result with
  `to-file-format`, and writes it followed by a newline to
  /tmp/attempt-2.csv. Both files are closed when the function returns."
  []
  (with-open [rdr (clojure.java.io/reader "data/train.csv")
              wrt (clojure.java.io/writer "/tmp/attempt-2.csv")]
    (doseq [line (rest (line-seq rdr))
            :let [formatted (-> line
                                split-on-comma
                                create-tuple
                                remove-unwanted-pixels
                                to-file-format)]]
      (.write wrt (str formatted "\n")))))

(defn -main
  "Program entry point: runs `clean-train-file`."
  []
  (clean-train-file))
  • LinkedIn
  • Tumblr
  • Reddit
  • Google+
  • Pinterest
  • Pocket