Skip to content

Instantly share code, notes, and snippets.

@redraiment
Created September 4, 2025 02:03
Show Gist options
  • Select an option

  • Save redraiment/4f5c9051dd99c9b0041916cc2f34d3ce to your computer and use it in GitHub Desktop.

Select an option

Save redraiment/4f5c9051dd99c9b0041916cc2f34d3ce to your computer and use it in GitHub Desktop.
Convert a worksheet XML file extracted from an .xlsx archive into a CSV file
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
/* String is a plain pointer to char; it is used for every text argument in this file. */
typedef char* String;
/* Predicate has the signature of the <ctype.h> classifiers (isupper, isdigit, ...). */
typedef int (*Predicate)(int);
/* Forward declarations of the helpers defined after main. */
/* Prints the usage text to stderr and exits with EXIT_FAILURE. */
void help(String);
/* Smaller / larger of two ints. */
int min(int, int);
int max(int, int);
/* Length of the leading run of characters accepted by the predicate. */
int take_with(String, Predicate);
/* Convert the first `int` characters of a column name / row number to a zero-based index. */
int col_to_index(String, int);
int row_to_index(String, int);
/* Convert the first `int` characters of a decimal / hexadecimal digit string to an int. */
int string_to_dec(String, int);
int string_to_hex(String, int);
/* Buffer */
/*
 * Growable byte buffer holding one CSV field while it is being assembled.
 * data[0] is always a reserved '"' (set by buffer_new); it is written out
 * only when the field turns out to need quoting.
 */
typedef struct _Buffer {
/* Heap storage for the field; grown by buffer_append. */
String data;
/* Number of bytes currently allocated for data. */
int capacity;
/* Number of bytes in use, including the reserved quote at index 0. */
int length;
/* True once a character that requires CSV quoting has been pushed. */
bool escaped;
} *Buffer;
Buffer buffer_new() {
Buffer this = (Buffer)calloc(sizeof(struct _Buffer), 1);
this->capacity = 1024;
this->data = (String)calloc(sizeof(char), this->capacity);
this->data[0]= '"';
this->length = 1;
this->escaped = false;
return this;
}
/*
 * Releases a buffer and its storage.
 *
 * Passing NULL is a no-op, mirroring free(). free(NULL) is itself a no-op,
 * so data needs no separate guard.
 */
void buffer_delete(Buffer this) {
    if (this == NULL) {
        return;
    }
    free(this->data);
    free(this);
}
/*
 * Returns the number of bytes currently held for output.
 *
 * The reserved quote at index 0 is counted only when the field is escaped;
 * for an unescaped field it is excluded.
 */
int buffer_length(Buffer this) {
    if (this->escaped) {
        return this->length;
    }
    return this->length - 1;
}
/*
 * Appends one raw byte without any CSV escaping.
 *
 * Storage is doubled whenever fewer than five free bytes would remain.
 * If the reallocation fails the program reports it and exits.
 */
void buffer_append(Buffer this, char value) {
    bool must_grow = this->length + 4 >= this->capacity;
    if (must_grow) {
        this->capacity *= 2;
        String grown = realloc(this->data, sizeof(char) * this->capacity);
        if (grown == NULL) {
            fprintf(stderr, "buffer alloc memory failed\n");
            exit(EXIT_FAILURE);
        }
        this->data = grown;
    }
    this->data[this->length++] = value;
}
/*
 * Appends one byte of field content, applying CSV escaping rules.
 *
 * A comma, double quote, CR or LF marks the field as needing quotes.
 * A NUL byte is treated the same way.
 * A double quote is stored twice.
 */
void buffer_push(Buffer this, char value) {
    switch (value) {
    case ',':
    case '"':
    case '\r':
    case '\n':
    case '\0':
        this->escaped = true;
        break;
    default:
        break;
    }
    int copies = (value == '"') ? 2 : 1;
    while (copies-- > 0) {
        buffer_append(this, value);
    }
}
/*
 * Encodes a code point as UTF-8 and pushes each resulting byte through
 * buffer_push.
 *
 * Values above 0x10FFFF are dropped without output. Values at or below
 * 0x7F (including negative ones) are pushed as a single byte.
 */
void buffer_push_wchar(Buffer this, int codepoint) {
    char bytes[4];
    int count = 0;
    if (codepoint <= 0x7F) { // 1-byte UTF-8
        bytes[count++] = (char)codepoint;
    } else if (codepoint <= 0x7FF) { // 2-byte UTF-8
        bytes[count++] = (char)(0xC0 | (codepoint >> 6));
        bytes[count++] = (char)(0x80 | (codepoint & 0x3F));
    } else if (codepoint <= 0xFFFF) { // 3-byte UTF-8
        bytes[count++] = (char)(0xE0 | (codepoint >> 12));
        bytes[count++] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        bytes[count++] = (char)(0x80 | (codepoint & 0x3F));
    } else if (codepoint <= 0x10FFFF) { // 4-byte UTF-8
        bytes[count++] = (char)(0xF0 | (codepoint >> 18));
        bytes[count++] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
        bytes[count++] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        bytes[count++] = (char)(0x80 | (codepoint & 0x3F));
    }
    for (int i = 0; i < count; i++) {
        buffer_push(this, bytes[i]);
    }
}
/*
 * Empties the buffer for the next field.
 *
 * Only the bookkeeping is reset: the storage and the reserved quote at
 * index 0 are kept.
 */
void buffer_clear(Buffer this) {
    this->escaped = false;
    this->length = 1; // keep only the reserved opening quote
}
/* Range */
/*
 * Rectangular cell range to export.
 * All four bounds are zero-based (col_to_index / row_to_index subtract one)
 * and inclusive (transform iterates with `<=` on the upper bounds).
 */
typedef struct _Range {
/* First row to export. */
int row_lower_bound;
/* Last row to export. */
int row_upper_bound;
/* First column to export. */
int col_lower_bound;
/* Last column to export; cells in this column end a CSV line. */
int col_upper_bound;
} *Range;
/*
 * Parses a range such as "A1:C10" into zero-based, inclusive bounds.
 *
 * Expected shape: <COL><ROW>:<COL><ROW>, with upper-case column letters and
 * decimal row numbers. The ':' separator is skipped when present.
 *
 * The program reports an error on stderr and exits when:
 *   - memory cannot be allocated,
 *   - any of the four parts is missing, or text follows the range,
 *   - the lower bound lies beyond the upper bound. Such a range can never
 *     contain a cell, and would otherwise yield an empty file without any
 *     diagnostic.
 *
 * Returns the parsed range; release it with range_delete.
 */
Range range_new(String range) {
    Range this = calloc(1, sizeof *this);
    if (this == NULL) {
        fprintf(stderr, "range alloc memory failed\n");
        exit(EXIT_FAILURE);
    }
    int offset = 0;
    int length = take_with(range + offset, isupper);
    this->col_lower_bound = col_to_index(range + offset, length);
    offset += length;
    length = take_with(range + offset, isdigit);
    this->row_lower_bound = row_to_index(range + offset, length);
    offset += length;
    if (range[offset] == ':') {
        offset++;
    }
    length = take_with(range + offset, isupper);
    this->col_upper_bound = col_to_index(range + offset, length);
    offset += length;
    length = take_with(range + offset, isdigit);
    this->row_upper_bound = row_to_index(range + offset, length);
    offset += length;
    // A missing part parses to index -1, so "negative" doubles as "absent".
    bool malformed = range[offset] != 0
        || this->col_lower_bound < 0
        || this->row_lower_bound < 0
        || this->col_upper_bound < 0
        || this->row_upper_bound < 0;
    bool inverted = this->col_lower_bound > this->col_upper_bound
        || this->row_lower_bound > this->row_upper_bound;
    if (malformed || inverted) {
        fprintf(stderr, "parse range '%s' to '<COL><ROW>:<COL><ROW>' failed\n", range);
        exit(EXIT_FAILURE);
    }
    return this;
}
/* Releases a Range allocated by range_new. */
void range_delete(Range this) {
free(this);
}
/*
 * Returns the separator that follows a cell in column `col`:
 * a newline after the last column of the range, a comma otherwise.
 */
char range_delimiter(Range this, int col) {
    if (col == this->col_upper_bound) {
        return '\n';
    }
    return ',';
}
/* Reader */
typedef int Descriptor;
typedef size_t Size;
/*
 * Input file loaded fully into memory.
 *
 * content is a heap buffer of size + 1 bytes whose last byte is always '\0',
 * so it can be scanned with the <string.h> search functions.
 */
typedef struct _Reader {
    Descriptor file; /* Open descriptor of the input file. */
    Size size;       /* Number of bytes read into content (terminator excluded). */
    String content;  /* NUL-terminated copy of the file. */
} *Reader;

/*
 * Opens `file_name` and loads its whole content as a NUL-terminated string.
 *
 * The file is read into heap memory rather than memory-mapped because
 * transform scans the content with strstr/strchr, which need a terminator.
 * A read-only mapping provides none when the file size is an exact multiple
 * of the page size, and a zero-length mapping is rejected outright, so empty
 * files could not be opened at all.
 *
 * Returns the reader; release it with reader_delete. On any failure the
 * program reports the error on stderr and exits.
 */
Reader reader_new(String file_name) {
    Reader this = calloc(1, sizeof *this);
    if (this == NULL) {
        fprintf(stderr, "reader alloc memory failed\n");
        exit(EXIT_FAILURE);
    }
    // Open the file descriptor.
    this->file = open(file_name, O_RDONLY);
    if (this->file < 0) {
        fprintf(stderr, "open %s failed\n", file_name);
        exit(EXIT_FAILURE);
    }
    // Determine the size of the file in bytes.
    struct stat status;
    if (fstat(this->file, &status) < 0 || status.st_size < 0) {
        close(this->file);
        fprintf(stderr, "count %s failed\n", file_name);
        exit(EXIT_FAILURE);
    }
    this->size = (Size)status.st_size;
    // Load the content, leaving room for the terminator.
    this->content = malloc(this->size + 1);
    if (this->content == NULL) {
        close(this->file);
        fprintf(stderr, "read %s failed\n", file_name);
        exit(EXIT_FAILURE);
    }
    Size filled = 0;
    while (filled < this->size) {
        ssize_t count = read(this->file, this->content + filled, this->size - filled);
        if (count < 0) {
            close(this->file);
            fprintf(stderr, "read %s failed\n", file_name);
            exit(EXIT_FAILURE);
        }
        if (count == 0) { // the file shrank after fstat; keep what was read
            break;
        }
        filled += (Size)count;
    }
    this->size = filled;
    this->content[filled] = '\0';
    return this;
}

/*
 * Releases the content, closes the descriptor and frees the reader.
 * Passing NULL is a no-op.
 */
void reader_delete(Reader this) {
    if (this == NULL) {
        return;
    }
    free(this->content);
    if (this->file >= 0) { // descriptor 0 is a valid open file
        close(this->file);
    }
    free(this);
}
/* Writer */
/* A Writer is simply the stdio stream the CSV output is written to. */
typedef FILE *Writer;

/*
 * Opens `file_name` for writing with mode "w".
 *
 * Returns the stream. If it cannot be opened the program reports the error
 * on stderr and exits.
 */
Writer writer_new(String file_name) {
    Writer stream = fopen(file_name, "w");
    if (stream != NULL) {
        return stream;
    }
    fprintf(stderr, "open %s failed\n", file_name);
    exit(EXIT_FAILURE);
}
/*
 * Flushes and closes the output stream.
 *
 * fclose performs the final flush, so a failure here means part of the CSV
 * never reached the file. It is reported on stderr and the program exits
 * with EXIT_FAILURE instead of finishing as if the output were complete.
 * Passing NULL is a no-op.
 */
void writer_delete(Writer this) {
    if (this == NULL) {
        return;
    }
    if (fclose(this) != 0) {
        fprintf(stderr, "close output failed\n");
        exit(EXIT_FAILURE);
    }
}
/*
 * Writes `length` bytes starting at `data` to the output stream.
 *
 * A non-positive length writes nothing. A short write (for example a full
 * disk) is reported on stderr and the program exits with EXIT_FAILURE,
 * rather than silently producing a truncated file.
 */
void writer_write_string(Writer this, String data, int length) {
    if (length <= 0) {
        return;
    }
    size_t count = (size_t)length;
    if (fwrite(data, sizeof(char), count, this) != count) {
        fprintf(stderr, "write output failed\n");
        exit(EXIT_FAILURE);
    }
}
/*
 * Writes the field held in `buffer`, followed by `delimiter`, then clears
 * the buffer.
 *
 * An escaped field is written with its reserved opening quote and an added
 * closing quote. An unescaped field is written without the reserved quote.
 */
void writer_write_buffer(Writer this, Buffer buffer, char delimiter) {
    bool quoted = buffer->escaped;
    if (quoted) {
        buffer_append(buffer, '"');
    }
    buffer_append(buffer, delimiter);
    // Read data only after the appends: buffer_append may reallocate it.
    int skip = quoted ? 0 : 1;
    writer_write_string(this, buffer->data + skip, buffer->length - skip);
    buffer_clear(buffer);
}
/* Transform */
/*
 * Decodes one XML reference starting at `pointer`, which must point at '&'.
 *
 * Recognises the five predefined entities and numeric character references
 * of the form &#DDDD; or &#xHHHH;. The decoded character is pushed to
 * `buffer`.
 *
 * Returns the number of source characters consumed, or 0 when the text is
 * not a well-formed reference (nothing is pushed in that case).
 */
static int read_reference(String pointer, Buffer buffer) {
    static const struct {
        const char *text;
        size_t length;
        char value;
    } named[] = {
        {"&lt;", 4, '<'},
        {"&gt;", 4, '>'},
        {"&amp;", 5, '&'},
        {"&apos;", 6, '\''},
        {"&quot;", 6, '"'},
    };
    for (size_t i = 0; i < sizeof named / sizeof named[0]; i++) {
        if (!strncmp(pointer, named[i].text, named[i].length)) {
            buffer_push(buffer, named[i].value);
            return (int)named[i].length;
        }
    }
    if (pointer[1] != '#') {
        return 0;
    }
    bool hex = pointer[2] == 'x' || pointer[2] == 'X';
    String digits = pointer + (hex ? 3 : 2);
    int count = 0;
    while (hex ? isxdigit((unsigned char)digits[count])
               : isdigit((unsigned char)digits[count])) {
        count++;
    }
    // Seven digits cover every code point up to 0x10FFFF (1114111) and keep
    // the conversion within the range of int.
    if (count == 0 || count > 7 || digits[count] != ';') {
        return 0;
    }
    int codepoint = hex ? string_to_hex(digits, count) : string_to_dec(digits, count);
    buffer_push_wchar(buffer, codepoint);
    return (int)(digits - pointer) + count + 1;
}

/*
 * Collects the character data of an element into `buffer`.
 *
 * Scanning starts at `pointer` and stops after the first occurrence of
 * `delimiter` (the closing tag) or at the end of the string. Everything
 * between '<' and '>' is skipped, so nested markup contributes only its
 * text. XML references are decoded; an '&' that does not start a
 * well-formed reference is copied literally.
 *
 * Returns the position just past the delimiter, or the position of the
 * terminating NUL when the delimiter was not found.
 */
String read_text(String pointer, String delimiter, Buffer buffer) {
    size_t length = strlen(delimiter);
    bool ignore = false;
    char value = 0;
    while ((value = *pointer) != 0) {
        if (!strncmp(pointer, delimiter, length)) {
            pointer += length;
            break;
        } else if (value == '<') {
            ignore = true;
        } else if (value == '>') {
            ignore = false;
        } else if (!ignore) {
            int consumed = (value == '&') ? read_reference(pointer, buffer) : 0;
            if (consumed > 0) {
                pointer += consumed;
                continue;
            }
            buffer_push(buffer, value);
        }
        pointer++;
    }
    return pointer;
}
void transform(Reader reader, Writer writer, Range range) {
Buffer buffer = buffer_new();
int row = range->row_lower_bound;
String pointer = reader->content;
while ((pointer = strstr(pointer, "<row ")) != NULL) {
pointer = strstr(pointer, " r=\"") + 4;
int row_index = row_to_index(pointer, take_with(pointer, isdigit));
if (row_index < range->row_lower_bound) { // 未到起始行
continue;
}
int row_upper_chunk = min(row_index, range->row_upper_bound + 1);
if (row < row_upper_chunk) {
buffer_clear(buffer);
for (int col = range->col_lower_bound; col <= range->col_upper_bound; col++) {
buffer_push(buffer, range_delimiter(range, col));
}
while (row < row_upper_chunk) { // 填充空行
writer_write_string(writer, buffer->data + 1, buffer->length - 1);
row++;
}
}
if (row_index > range->row_upper_bound) {
break;
}
int col = range->col_lower_bound;
int col_index = -1;
bool contains = false;
while ((pointer = strchr(pointer, '<')) != NULL) {
if (!strncmp(pointer, "</row>", 6)) { // 本行结束
pointer += 6;
if (col <= range->col_upper_bound) {
buffer_clear(buffer);
while (col <= range->col_upper_bound) { // 填充空列
buffer_push(buffer, range_delimiter(range, col));
col++;
}
writer_write_string(writer, buffer->data + 1, buffer->length - 1);
}
row++;
break;
} else if (contains && !strncmp(pointer, "</c> ", 4)) { // 本列结束
pointer += 4;
writer_write_buffer(writer, buffer, range_delimiter(range, col));
col++;
} else if (!strncmp(pointer, "<c ", 3)) {
pointer = strstr(pointer, " r=\"") + 4;
col_index = col_to_index(pointer, take_with(pointer, isupper));
pointer = strchr(pointer, '>');
int col_upper_chunk = min(col_index, range->col_upper_bound + 1);
if (col < col_upper_chunk) {
buffer_clear(buffer);
while (col < col_upper_chunk) { // 填充空列
buffer_push(buffer, range_delimiter(range, col));
col++;
}
writer_write_string(writer, buffer->data + 1, buffer->length - 1);
}
buffer_clear(buffer);
contains = range->col_lower_bound <= col_index && col_index <= range->col_upper_bound;
} else if (contains && !strncmp(pointer, "<v>", 3)) {
pointer = read_text(pointer, "</v>", buffer);
} else if (contains && !strncmp(pointer, "<is>", 4)) {
pointer = read_text(pointer, "</is>", buffer);
} else {
pointer = strchr(pointer, '>');
}
}
}
buffer_delete(buffer);
}
/* Main */
/*
 * Entry point: <program> <XML> <RANGE> <CSV>.
 *
 * With any other argument count the usage text is printed and the program
 * exits (help does not return). Otherwise the input is opened, the range is
 * parsed, the output is opened, and the conversion runs.
 */
int main(int argc, char* argv[]) {
    if (argc != 4) {
        help(argv[0]);
    }
    Reader reader = reader_new(argv[1]);
    Range range = range_new(argv[2]);
    Writer writer = writer_new(argv[3]);
    transform(reader, writer, range);
    range_delete(range);
    reader_delete(reader);
    writer_delete(writer);
    return EXIT_SUCCESS;
}
/* Helpers */
/*
 * Prints the usage text to stderr and terminates the program with
 * EXIT_FAILURE. `program` is shown as the command name.
 */
void help(String program) {
    static const char *const details[] = {
        "PARAMETERS:\n\n",
        " XML Input xml file name (required).\n",
        " RANGE Data range (required).\n",
        " CSV Output csv file name (required).\n",
        "\nAuthor: Zhang, Zepeng <[email protected]>\n",
    };
    fprintf(stderr, "Usage: %s <XML> <RANGE> <CSV>\n\n", program);
    for (size_t i = 0; i < sizeof details / sizeof details[0]; i++) {
        fputs(details[i], stderr);
    }
    exit(EXIT_FAILURE);
}
/* Returns the smaller of `a` and `b`. */
int min(int a, int b) {
    if (a < b) {
        return a;
    }
    return b;
}
/* Returns the larger of `a` and `b`. */
int max(int a, int b) {
    if (a > b) {
        return a;
    }
    return b;
}
/*
 * Returns the length of the leading run of characters in `s` for which
 * `predicate` is non-zero. Returns 0 for NULL or an empty string.
 *
 * Each character is converted to unsigned char before the call: the
 * <ctype.h> classifiers are undefined for negative arguments other than
 * EOF, and plain char may be signed, so bytes of non-ASCII text would
 * otherwise be passed as negative values.
 */
int take_with(String s, Predicate predicate) {
    if (s == NULL) {
        return 0;
    }
    int index = 0;
    while (s[index] != 0 && predicate((unsigned char)s[index])) {
        index++;
    }
    return index;
}
/*
 * Converts the first `length` characters of a spreadsheet column name
 * ("A", "Z", "AA", ...) to a zero-based column index ("A" -> 0).
 *
 * Returns -1 when `s` is NULL, `length` is not positive, or a character
 * outside 'A'..'Z' is found. A name too long to fit in an int saturates at
 * INT_MAX - 1 instead of overflowing.
 */
int col_to_index(String s, int length) {
    int column = 0;
    for (int index = 0; s != NULL && index < length; index++) {
        int digit = s[index] - 'A' + 1; // bijective base 26: A = 1 ... Z = 26
        if (digit < 1 || digit > 26) {
            return -1;
        }
        if (column > (INT_MAX - digit) / 26) {
            return INT_MAX - 1;
        }
        column = column * 26 + digit;
    }
    return column - 1;
}
/*
 * Converts the first `length` characters of a one-based decimal row number
 * to a zero-based row index. An empty input yields -1.
 */
int row_to_index(String s, int length) {
    int one_based = string_to_dec(s, length);
    return one_based - 1;
}
/*
 * Converts the first `length` characters of `s` from decimal to int.
 *
 * Conversion stops at the first character that is not a decimal digit.
 * Returns 0 when `s` is NULL or no digit is read. Values too large for an
 * int saturate at INT_MAX instead of overflowing.
 */
int string_to_dec(String s, int length) {
    int value = 0;
    for (int index = 0; s != NULL && index < length; index++) {
        int digit = s[index] - '0';
        if (digit < 0 || digit > 9) {
            break;
        }
        if (value > (INT_MAX - digit) / 10) {
            return INT_MAX;
        }
        value = value * 10 + digit;
    }
    return value;
}
/*
 * Converts the first `length` characters of `s` from hexadecimal to int.
 *
 * Digits may be upper or lower case. Conversion stops at the first
 * character that is not a hexadecimal digit. Returns 0 when `s` is NULL or
 * no digit is read. Values too large for an int saturate at INT_MAX
 * instead of overflowing.
 */
int string_to_hex(String s, int length) {
    int value = 0;
    for (int index = 0; s != NULL && index < length; index++) {
        char c = s[index];
        int digit;
        if (c >= '0' && c <= '9') {
            digit = c - '0';
        } else if (c >= 'A' && c <= 'F') {
            digit = c - 'A' + 10;
        } else if (c >= 'a' && c <= 'f') {
            digit = c - 'a' + 10;
        } else {
            break;
        }
        if (value > (INT_MAX - digit) / 16) {
            return INT_MAX;
        }
        value = value * 16 + digit;
    }
    return value;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment