Lisp HUG Maillist Archive

Re: CSV Files

BTW... for anyone interested, I have a very fast CSV File reader/parser in Lisp, that I will pass along. I rewrote an older version of same a few weeks ago, but this time, I made extensive use of caching, and perform as few computations as possible until demanded by the calling application. 

The result is very very fast, uses little or no copying, except when necessary for safety sake (as when allowing a user specified function access to the embedded string in a line read from a file).

In all, about 250 lines of Lisp code. not much...

Aww heck....

------------------------------------
;; fast-csv.lisp -- Fast, Simple, CSV File Reading
;;
;; This version is very fast because it does the minimum possible work
;; in reading CSV files, and extracting information on demand.
;;
;; It is assumed that no fields are quoted fields that contain the primary delimiter.
;; Primary delimiters (comma for CSV, tab for TSV files) are heuristically determined
;; from examination of the first line of the file, which is assumed to be the headings
;; line. Subsequent lines are assumed to contain data. Empty lines are discarded.
;;
;; These conditions are satisfied by nearly 99%+ of all the CSV files ever seen.
;; If your CSV/TSV file violates any of these conditions, then use the more elaborate
;; and general-purpose routines in CSV.LISP.
;;
;; DM/RAL  06/07
;; ----------------------------------------------------------------------------

(defpackage fast-csv
  (:use common-lisp)
  (:nicknames fcsv)
  (:export
   read-file
   get-column
   get-numeric-column
   get-field
   get-numeric-field))

(in-package fast-csv)

(defun collect-delimiter-positions (s delim &key count (start 0))
  (loop with  pos = start
        with  end = start
        while (and end
                   (or (null count)
                       (plusp count)))
        do    (setq end (position delim s :start pos))
        collect (or end (length s))
        when  end
        do    (setq pos (1+ end))
        when  count
        do    (decf count)
        ))

(defun collect-headers (s delim)
  (let ((pos (collect-delimiter-positions s delim)))
    (loop with start = 0
          for  p in pos
          collect (subseq s start p)
          do      (setq start (1+ p))
          )))

(defun read-lines (&optional fname)
  ;;
  ;; Return a list of text lines from the file
  ;;
  (lw:when-let (fnm (or fname
             (capi:prompt-for-file
              "Select File to Read"
              :filters '("CSV Files" "*.csv;*.tsv"
                                             "All Files" "*.*"))
             ))
    (with-open-file (fp fnm)
      (loop for line = (read-line fp nil :eof)
            until (eq line :eof)
            while line
            when  (plusp (length line)) ;; elide empty lines
            collect line))
    ))

(defun decide-csv-or-tsv (line)
  ;;
  ;; count the number of commas and tabs in the list of lines.
  ;; If commas outnumber tabs then we probably have CSV. Else we
  ;; probably have TSV.
  ;;
  (let ((n-comma (count-if (um:curry #'char= #\,)   line))
(n-tab   (count-if (um:curry #'char= #\Tab) line)))
    (if (> n-comma n-tab)
        #\,
      #\Tab)
    ))

(defstruct csv-info
  hdrs
  delim
  row-infos
  nlines
  lines
  ends-cache)

(defun collect-additional-ends (info column row line)
  (with-accessors ((ends-cache  csv-info-ends-cache)
                   (row-infos   csv-info-row-infos )
                   (delim       csv-info-delim     )) info
    (let* ((ends     (aref row-infos row))
           (new-ends (if ends
                         ;; we have some partial information
                         (let* ((nends     (length ends))
                                (more-ends (collect-delimiter-positions
                                            line delim
                                            :count (1+ (- column nends))
                                            :start (1+ (aref ends (1- nends)))
                                            )))
                           (concatenate 'vector ends more-ends))
                      
                       ;; else we never had any information -- build from scratch
                       (coerce
                        (collect-delimiter-positions
                         line delim
                         :count (1+ column)
                         :start 0)
                        'vector)))
          
           (unique-ends (gethash new-ends ends-cache nil)))
      (unless unique-ends
        (setf (gethash new-ends ends-cache) new-ends))
      (setf (aref row-infos row) (or unique-ends new-ends))
      )))

(defun ensure-column-of-row-exists (info column row line)
  ;; at this point row-info cannot be nil, but its contents might be...
  (with-accessors ((row-infos  csv-info-row-infos)) info
    (let ((ends (aref row-infos row)))
      (unless (< column (length ends))
        (collect-additional-ends info column row line))
      )))
         
(defun extract-data-of-column (info column reverse op)
  (with-accessors ((nlines    csv-info-nlines)
                   (lines     csv-info-lines)
                   (row-infos csv-info-row-infos)) info
    (let* ((nelm1 (1- nlines))
           (data  (make-array nlines)))
      (loop for line in lines
            for row from 0
            for revrow = (- nelm1 row)
            do
            (ensure-column-of-row-exists info column row line)
            ;; at this point ends is a vector of delimiter positions
            ;; that is at least as long as one greater than the column index
            (let* ((ends  (aref row-infos row))
                   (start (if (zerop column)
                              0
                            (1+ (aref ends (1- column)))
                            ))
                   (end   (aref ends column))
                   (datum (funcall op line start end)))
              (setf (aref data (if reverse revrow row)) datum)
              ))
      data
      )))

(defun get-column-of-hdr (hdr info)
  (position (um:mkstr hdr) (csv-info-hdrs info)
            :test #'string-equal))

(defun ensure-row-infos-exist (info column all)
  (with-accessors ((row-infos  csv-info-row-infos)
                   (nlines     csv-info-nlines   )
                   (lines      csv-info-lines    )) info
    (unless row-infos
      (setf row-infos (make-array nlines))
      (when all
        (loop for row from 0
              for line in lines
              do
              (collect-additional-ends info column row line)))
      )))

(defun get-string-field-op (op)
  (if op
      (um:compose op #'subseq)
    #'subseq))

(defun get-numeric-field-op (scrubber op)
  (let ((pre-op
         (if scrubber
             ;; we need to extract a subseq for the user's function
             (lambda (line start end)
               (read-from-string
                (funcall scrubber (subseq line start end))
                nil 0))
          
           ;; else we can avoid creating a subseq
           (lambda (line start end)
             (read-from-string line nil 0
                               :start start :end end)))
         ))
    (if op
        (um:compose op pre-op)
      pre-op)
    ))

;; --------------------------------------------------------------
;; user accessible routines...
;;
(defun read-file (fname &key (ndrop 0))
  (let* ((lines      (nthcdr ndrop (read-lines fname)))
         (delim      (decide-csv-or-tsv (first lines)))
         (hdrs       (collect-headers (first lines) delim))
         (data-lines (rest lines)))
    (make-csv-info
     :hdrs       hdrs
     :delim      delim
     :row-infos  nil ;; a vector of vectors of delimiter positions
     :nlines     (length data-lines)
     :lines      data-lines
     :ends-cache (make-hash-table :test #'equalp)
     )))
        
(defun get-column (hdr info &key reverse op num-op)
  ;; user should utilize keys :reverse and :op
  ;; the :num-op key is for our private use
  (let* ((column (get-column-of-hdr hdr info)))
    (when column
      (ensure-row-infos-exist info column :all)
      (extract-data-of-column info column reverse
                              (or num-op
                                  (get-string-field-op op)))
      )))

(defun get-numeric-column (hdr info &key scrubber reverse op)
  (get-column hdr info
              :reverse reverse
              :num-op  (get-numeric-field-op scrubber op)))


(defun get-field (hdr row info &key op num-op reverse)
  ;; user should utilize keys :reverse and :op
  ;; the :num-op key is for our private use
  (with-accessors ((lines     csv-info-lines    )
                   (row-infos csv-info-row-infos)
                   (nlines    csv-info-nlines   )) info
    (let ((column (get-column-of-hdr hdr info)))
      (when column
        (ensure-row-infos-exist info column nil)
        (let* ((actual-row (cond ((numberp row) (if reverse
                                                    (- nlines row 1)
                                                  row))
                                
                                 ((eq row :last) (if reverse
                                                     0
                                                   (1- nlines)))
                                
                                 ((eq row :first) (if reverse
                                                      (1- nlines)
                                                    0))
                                 ))
               (line (nth actual-row lines)))
          (ensure-column-of-row-exists info column actual-row line)
          (let* ((ends  (aref row-infos actual-row))
                 (start (if (zerop column)
                            0
                          (1+ (aref ends (1- column)))
                          ))
                 (end   (aref ends column)))
            (funcall (or num-op
                         (get-string-field-op op))
                     line start end)
            ))
        ))
    ))

(defun get-numeric-field (hdr row info &key scrubber op reverse)
  (get-field hdr row info
             :num-op (get-numeric-field-op scrubber op)
             :reverse reverse))

-----------------------------------------

David McClain
Chief Technical Officer
Refined Audiometrics Laboratory
4391 N. Camino Ferreo
Tucson, AZ  85750

email: dbm@refined-audiometrics.com
phone: 1.520.390.3995
web: http://www.refined-audiometrics.com
Skype: dbmcclain


Updated at: 2020-12-10 08:45 UTC