前端實現CSV文件解析的方法詳解
基本規則
簡單來說就是:使用逗號分隔數據,當出現衝突時使用雙引號包裹衝突數據來解決衝突(沒有衝突也可以使用雙引號包裹數據)。通過逗號將數據分隔成列,通過 \n
換行符將數據分隔成行,因此 CSV 格式可以用來表示二維表格數據。
CSV 解析
根據上面的格式,簡單寫一個 CSV 解析器。思路其實很簡單,就是先按 \n
進行分行,再按 ,
進行分列。只不過需要注意的是:在分列的時候遇到 "
要將雙引號內的內容作為一個整體。具體代碼如下:
// Special characters of the CSV format
const SIGN = {
  rowDelimiter: '\n',
  colDelimiter: ',',
  specialCharacter: '"',
}

/**
 * Parse CSV text into a two-dimensional array of strings.
 *
 * Rows are separated by `\n` (a preceding `\r` is dropped, so CRLF input
 * works too) and columns by `,`. A field may be wrapped in double quotes, in
 * which case it can contain commas, line breaks and escaped quotes (`""`
 * stands for one literal `"`).
 *
 * A completely blank line produces an empty row (`[]`), so `parseCsv('')`
 * returns `[[]]`. Any line that contains at least one delimiter keeps all of
 * its fields, including empty trailing ones (`'a,b,'` -> `['a', 'b', '']`).
 *
 * @param {string} [str=''] - CSV text.
 * @returns {string[][]} One array per row, one string per field.
 * @throws {TypeError} If `str` is not a string.
 */
const parseCsv = (str = '') => {
  if (typeof str !== 'string') {
    throw new TypeError('parseCsv expects a string')
  }

  const rows = []
  let chunk = []
  let unit = ''
  // true while we are outside of a quoted field
  let doubleQuoteIsClose = true
  // remembers that the current field was quoted, so `""` is kept as an empty field
  let unitWasQuoted = false

  const pushUnit = () => {
    chunk.push(unit)
    unit = ''
    unitWasQuoted = false
  }

  const pushRow = () => {
    // A blank line has neither a delimiter nor any content: keep it as an empty row.
    if (chunk.length > 0 || unit !== '' || unitWasQuoted) {
      pushUnit()
    }
    rows.push(chunk)
    chunk = []
  }

  for (let i = 0; i < str.length; i++) {
    const s = str[i]

    if (!doubleQuoteIsClose) {
      // Inside quotes everything is data, except the quote character itself.
      if (s !== SIGN.specialCharacter) {
        unit += s
      } else if (str[i + 1] === SIGN.specialCharacter) {
        // `""` is an escaped quote: emit one quote and skip the second one
        unit += s
        i++
      } else {
        doubleQuoteIsClose = true
      }
      continue
    }

    // A quote only opens a quoted field when it is the first character of the field.
    if (s === SIGN.specialCharacter && unit === '' && !unitWasQuoted) {
      doubleQuoteIsClose = false
      unitWasQuoted = true
      continue
    }

    if (s === SIGN.colDelimiter) {
      pushUnit()
      continue
    }

    if (s === SIGN.rowDelimiter) {
      pushRow()
      continue
    }

    // Drop the CR of a CRLF line ending
    if (s === '\r' && str[i + 1] === SIGN.rowDelimiter) {
      continue
    }

    unit += s
  }

  // Collect the last row (also covers an unterminated quoted field)
  pushRow()

  return rows
}
通過上面處理,我們可以將一個 csv 格式的數據解析成一個二維數組。其實 csv 數據組織格式的核心很簡單:使用 \n
分隔行,使用 ,
分隔列,使用 "
作為特殊字符來解決字符衝突。至於是否組裝 header,以及一些優化處理就比較簡單了。
PapaParse 源碼分析
接下來我們來看一款成熟的工具:PapaParse。PapaParse 有豐富的使用文檔,並且在 GitHub 上有 12k 的 star,npm 的周下載量也高達兩百多萬,是一款比較推薦的 csv 解析庫。
解析相關的核心源碼我放在了附錄。我把它的解析部分分成兩種情況,一種是不包含雙引號的情況,代碼如下:
// Next delimiter comes before next newline, so we've reached end of field if ( nextDelim !== -1 && (nextDelim < nextNewline || nextNewline === -1) ) { row.push(input.substring(cursor, nextDelim)) cursor = nextDelim + delimLen // we look for next delimiter char nextDelim = input.indexOf(delim, cursor) continue } // End of row if (nextNewline !== -1) { row.push(input.substring(cursor, nextNewline)) saveRow(nextNewline + newlineLen) if (stepIsFunction) { doStep() if (aborted) return returnable() } if (preview && data.length >= preview) return returnable(true) continue }
其實也就是簡單的按分隔符切割字符串。
另外一種是包含雙引號的情況。值得一提的是:如果字段包含雙引號,那麼開頭和結尾一定是雙引號;雙引號中間部分的解析主要需要注意的是內部可能包含轉義字符。比如:
// If this quote is escaped, it's part of the data; skip it // If the quote character is the escape character, then check if the next character is the escape character if (quoteChar === escapeChar && input[quoteSearch + 1] === escapeChar) { quoteSearch++ continue } // If the quote character is not the escape character, then check if the previous character was the escape character if ( quoteChar !== escapeChar && quoteSearch !== 0 && input[quoteSearch - 1] === escapeChar ) { continue }
它的這個邏輯對於轉義字符的判斷更加細膩,不過我們之前的 doubleQuoteIsClose 判斷應該也沒有太大的問題。
// Toggle the quote state on every double-quote character; two consecutive
// quotes toggle it twice and therefore leave the state as it was.
if (s === SIGN.specialCharacter) { doubleQuoteIsClose = !doubleQuoteIsClose }
所以 csv 數據格式解析的核心邏輯其實是很簡單的。
附錄(解析相關的源碼)
/**
 * Parses `input` into rows of fields and returns the collected data, errors
 * and meta information.
 *
 * NOTE(review): this function relies on names from the enclosing scope that are
 * not defined in this excerpt (delim, newline, comments, step, preview,
 * fastMode, quoteChar, escapeChar, config, cursor, aborted, renamedHeaders,
 * isFunction, escapeRegExp) -- confirm their definitions in the full source.
 *
 * @param {string} input - Text to parse; any non-string value throws.
 * @param {number} [baseIndex] - Added to the reported `meta.cursor` (presumably
 *   a numeric offset); when truthy, the duplicate-header renaming is skipped.
 * @param {boolean} [ignoreLastRow] - When truthy, the last (possibly
 *   incomplete) row is not added to the results and no "MissingQuotes" error
 *   is recorded for it.
 * @returns {{data: Array, errors: Array, meta: Object}} The object built by
 *   `returnable()`.
 * @throws {Error} If `input` is not a string.
 */
this.parse = function (input, baseIndex, ignoreLastRow) {
  // For some reason, in Chrome, this speeds things up (!?)
  if (typeof input !== 'string') throw new Error('Input must be a string')

  // We don't need to compute some of these every time parse() is called,
  // but having them in a more local scope seems to perform better
  var inputLen = input.length,
    delimLen = delim.length,
    newlineLen = newline.length,
    commentsLen = comments.length
  var stepIsFunction = isFunction(step)

  // Establish starting state
  cursor = 0
  var data = [],
    errors = [],
    row = [],
    lastCursor = 0

  if (!input) return returnable()

  // Rename headers if there are duplicates
  var firstLine
  if (config.header && !baseIndex) {
    firstLine = input.split(newline)[0]
    var headers = firstLine.split(delim)
    var separator = '_'
    var headerMap = new Set()
    var headerCount = {}
    var duplicateHeaders = false

    // Using old-style 'for' loop to avoid prototype pollution that would be picked up with 'var j in headers'
    for (var j = 0; j < headers.length; j++) {
      var header = headers[j]
      if (isFunction(config.transformHeader))
        header = config.transformHeader(header, j)
      var headerName = header

      // Number of times this header has been seen so far
      var count = headerCount[header] || 0
      if (count > 0) {
        duplicateHeaders = true
        headerName = header + separator + count
        // Initialise the variable if it hasn't been.
        if (renamedHeaders === null) {
          renamedHeaders = {}
        }
      }
      headerCount[header] = count + 1
      // In case it already exists, we add more separators
      while (headerMap.has(headerName)) {
        headerName = headerName + separator + count
      }
      headerMap.add(headerName)
      if (count > 0) {
        // Map the new unique name back to the original header text
        renamedHeaders[headerName] = header
      }
    }
    if (duplicateHeaders) {
      // Rewrite the first line of the input with the de-duplicated header names
      var editedInput = input.split(newline)
      editedInput[0] = Array.from(headerMap).join(delim)
      input = editedInput.join(newline)
    }
  }

  // Fast path: plain string splitting, used when fast mode is forced or when
  // the input contains no quote character at all
  if (fastMode || (fastMode !== false && input.indexOf(quoteChar) === -1)) {
    var rows = input.split(newline)
    for (var i = 0; i < rows.length; i++) {
      row = rows[i]
      // use firstline as row length may be changed due to duplicated headers
      if (i === 0 && firstLine !== undefined) {
        cursor += firstLine.length
      } else {
        cursor += row.length
      }
      if (i !== rows.length - 1) cursor += newline.length
      else if (ignoreLastRow) return returnable()
      if (comments && row.substring(0, commentsLen) === comments) continue
      if (stepIsFunction) {
        data = []
        pushRow(row.split(delim))
        doStep()
        if (aborted) return returnable()
      } else pushRow(row.split(delim))
      if (preview && i >= preview) {
        data = data.slice(0, preview)
        return returnable(true)
      }
    }
    return returnable()
  }

  var nextDelim = input.indexOf(delim, cursor)
  var nextNewline = input.indexOf(newline, cursor)
  // Matches an escaped quote (escape character followed by quote character)
  var quoteCharRegex = new RegExp(
    escapeRegExp(escapeChar) + escapeRegExp(quoteChar),
    'g'
  )
  var quoteSearch = input.indexOf(quoteChar, cursor)

  // Parser loop
  for (;;) {
    // Field has opening quote
    if (input[cursor] === quoteChar) {
      // Start our search for the closing quote where the cursor is
      quoteSearch = cursor

      // Skip the opening quote
      cursor++

      for (;;) {
        // Find closing quote
        quoteSearch = input.indexOf(quoteChar, quoteSearch + 1)

        // No other quotes are found - no other delimiters
        if (quoteSearch === -1) {
          if (!ignoreLastRow) {
            // No closing quote... what a pity
            errors.push({
              type: 'Quotes',
              code: 'MissingQuotes',
              message: 'Quoted field unterminated',
              row: data.length, // row has yet to be inserted
              index: cursor,
            })
          }
          return finish()
        }

        // Closing quote at EOF
        if (quoteSearch === inputLen - 1) {
          var value = input
            .substring(cursor, quoteSearch)
            .replace(quoteCharRegex, quoteChar)
          return finish(value)
        }

        // If this quote is escaped, it's part of the data; skip it
        // If the quote character is the escape character, then check if the next character is the escape character
        // Two consecutive quote characters represent one escaped quote
        if (quoteChar === escapeChar && input[quoteSearch + 1] === escapeChar) {
          quoteSearch++
          continue
        }

        // If the quote character is not the escape character, then check if the previous character was the escape character
        if (
          quoteChar !== escapeChar &&
          quoteSearch !== 0 &&
          input[quoteSearch - 1] === escapeChar
        ) {
          continue
        }

        // From here on the quote is treated as a candidate closing quote.
        // Delimiter/newline positions found inside the quoted text are stale,
        // so search again starting after the quote.
        if (nextDelim !== -1 && nextDelim < quoteSearch + 1) {
          nextDelim = input.indexOf(delim, quoteSearch + 1)
        }
        if (nextNewline !== -1 && nextNewline < quoteSearch + 1) {
          nextNewline = input.indexOf(newline, quoteSearch + 1)
        }
        // Check up to nextDelim or nextNewline, whichever is closest
        var checkUpTo =
          nextNewline === -1 ? nextDelim : Math.min(nextDelim, nextNewline)
        var spacesBetweenQuoteAndDelimiter = extraSpaces(checkUpTo)

        // Closing quote followed by delimiter or 'unnecessary spaces + delimiter'
        // (the spaces are skipped)
        if (
          input.substr(
            quoteSearch + 1 + spacesBetweenQuoteAndDelimiter,
            delimLen
          ) === delim
        ) {
          row.push(
            input
              .substring(cursor, quoteSearch)
              .replace(quoteCharRegex, quoteChar)
          )
          cursor = quoteSearch + 1 + spacesBetweenQuoteAndDelimiter + delimLen

          // If char after following delimiter is not quoteChar, we find next quote char position
          if (
            input[
              quoteSearch + 1 + spacesBetweenQuoteAndDelimiter + delimLen
            ] !== quoteChar
          ) {
            quoteSearch = input.indexOf(quoteChar, cursor)
          }
          nextDelim = input.indexOf(delim, cursor)
          nextNewline = input.indexOf(newline, cursor)
          break
        }

        var spacesBetweenQuoteAndNewLine = extraSpaces(nextNewline)

        // Closing quote followed by newline or 'unnecessary spaces + newLine'
        if (
          input.substring(
            quoteSearch + 1 + spacesBetweenQuoteAndNewLine,
            quoteSearch + 1 + spacesBetweenQuoteAndNewLine + newlineLen
          ) === newline
        ) {
          row.push(
            input
              .substring(cursor, quoteSearch)
              .replace(quoteCharRegex, quoteChar)
          )
          saveRow(quoteSearch + 1 + spacesBetweenQuoteAndNewLine + newlineLen)
          nextDelim = input.indexOf(delim, cursor) // because we may have skipped the nextDelim in the quoted field
          quoteSearch = input.indexOf(quoteChar, cursor) // we search for first quote in next line

          if (stepIsFunction) {
            doStep()
            if (aborted) return returnable()
          }

          if (preview && data.length >= preview) return returnable(true)

          break
        }

        // Checks for valid closing quotes are complete (escaped quotes or quote followed by EOF/delimiter/newline) -- assume these quotes are part of an invalid text string
        errors.push({
          type: 'Quotes',
          code: 'InvalidQuotes',
          message: 'Trailing quote on quoted field is malformed',
          row: data.length, // row has yet to be inserted
          index: cursor,
        })

        quoteSearch++
        continue
      }

      continue
    }

    // Comment found at start of new line
    if (
      comments &&
      row.length === 0 &&
      input.substring(cursor, cursor + commentsLen) === comments
    ) {
      if (nextNewline === -1)
        // Comment ends at EOF
        return returnable()
      cursor = nextNewline + newlineLen
      nextNewline = input.indexOf(newline, cursor)
      nextDelim = input.indexOf(delim, cursor)
      continue
    }

    // Next delimiter comes before next newline, so we've reached end of field
    if (nextDelim !== -1 && (nextDelim < nextNewline || nextNewline === -1)) {
      row.push(input.substring(cursor, nextDelim))
      cursor = nextDelim + delimLen
      // we look for next delimiter char
      nextDelim = input.indexOf(delim, cursor)
      continue
    }

    // End of row
    if (nextNewline !== -1) {
      row.push(input.substring(cursor, nextNewline))
      saveRow(nextNewline + newlineLen)

      if (stepIsFunction) {
        doStep()
        if (aborted) return returnable()
      }

      if (preview && data.length >= preview) return returnable(true)

      continue
    }

    // Neither a delimiter nor a newline is left: the rest is the last field
    break
  }

  return finish()

  /** Appends `row` to the results and records the current cursor position in `lastCursor`. */
  function pushRow(row) {
    data.push(row)
    lastCursor = cursor
  }

  /**
   * checks if there are extra spaces after closing quote and given index without any text
   * if Yes, returns the number of spaces
   */
  function extraSpaces(index) {
    var spaceLength = 0
    if (index !== -1) {
      var textBetweenClosingQuoteAndIndex = input.substring(
        quoteSearch + 1,
        index
      )
      if (
        textBetweenClosingQuoteAndIndex &&
        textBetweenClosingQuoteAndIndex.trim() === ''
      ) {
        spaceLength = textBetweenClosingQuoteAndIndex.length
      }
    }
    return spaceLength
  }

  /**
   * Appends the remaining input from cursor to the end into
   * row, saves the row, calls step, and returns the results.
   */
  function finish(value) {
    if (ignoreLastRow) return returnable()
    if (typeof value === 'undefined') value = input.substring(cursor)
    row.push(value)
    cursor = inputLen // important in case parsing is paused
    pushRow(row)
    if (stepIsFunction) doStep()
    return returnable()
  }

  /**
   * Appends the current row to the results. It sets the cursor
   * to newCursor and finds the nextNewline. The caller should
   * take care to execute user's step function and check for
   * preview and end parsing if necessary.
   */
  function saveRow(newCursor) {
    cursor = newCursor
    pushRow(row)
    row = []
    nextNewline = input.indexOf(newline, cursor)
  }

  /** Returns an object with the results, errors, and meta. */
  function returnable(stopped) {
    return {
      data: data,
      errors: errors,
      meta: {
        delimiter: delim,
        linebreak: newline,
        aborted: aborted,
        truncated: !!stopped,
        cursor: lastCursor + (baseIndex || 0),
        renamedHeaders: renamedHeaders,
      },
    }
  }

  /** Executes the user's step function and resets data & errors. */
  function doStep() {
    step(returnable())
    data = []
    errors = []
  }
}
到此這篇關於前端實現CSV文件解析的方法詳解的文章就介紹到這了,更多相關CSV文件解析內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以後多多支持腳本之家!
相關文章
JavaScript變量類型以及變量之間的轉換你了解嗎
這篇文章主要為大家詳細介紹了JavaScript變量類型以及變量之間的轉換,文中示例代碼介紹的非常詳細,具有一定的參考價值,感興趣的小伙伴們可以參考一下,希望能夠給你帶來幫助 2022-02-02 JavaScript拆分字符串時產生空字符的解決方案
使用JavaScript的split方法拆分字符串時出現一些空字符串"",尤其是當使用正則表達式作為分隔符的時候。那麼,產生這些空字符串的原因是什麼?又該如何來處理呢,這就是今天我們要探討的問題 2014-09-09 JavaScript極簡入門教程(二):對象和函數
這篇文章主要介紹了JavaScript極簡入門教程(二):對象和函數,本文講解了對象基礎知識、函數基礎知識、函數調用、異常、繼承等內容,需要的朋友可以參考下 2014-10-10