Created
September 4, 2025 02:03
-
-
Save redraiment/4f5c9051dd99c9b0041916cc2f34d3ce to your computer and use it in GitHub Desktop.
Convert a worksheet XML file from an .xlsx archive to a CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _DEFAULT_SOURCE /* expose MAP_ANONYMOUS from <sys/mman.h> in strict -std=c* modes */

#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
/* Shared aliases used throughout the file. */
typedef char *String;          /* pointer to character data */
typedef int (*Predicate)(int); /* character classifier with the <ctype.h> signature */

/* Prototypes of the helper functions defined in the "Helpers" section. */
void help(String program);
int min(int a, int b);
int max(int a, int b);
int take_with(String s, Predicate predicate);
int col_to_index(String s, int length);
int row_to_index(String s, int length);
int string_to_dec(String s, int length);
int string_to_hex(String s, int length);
| /* Buffer */ | |
| typedef struct _Buffer { | |
| String data; | |
| int capacity; | |
| int length; | |
| bool escaped; | |
| } *Buffer; | |
| Buffer buffer_new() { | |
| Buffer this = (Buffer)calloc(sizeof(struct _Buffer), 1); | |
| this->capacity = 1024; | |
| this->data = (String)calloc(sizeof(char), this->capacity); | |
| this->data[0]= '"'; | |
| this->length = 1; | |
| this->escaped = false; | |
| return this; | |
| } | |
| void buffer_delete(Buffer this) { | |
| if (this->data != NULL) { | |
| free(this->data); | |
| } | |
| free(this); | |
| } | |
| int buffer_length(Buffer this) { | |
| return this->length - (this->escaped ? 0 : 1); | |
| } | |
| void buffer_append(Buffer this, char value) { | |
| if (this->length + 4 >= this->capacity) { | |
| this->capacity *= 2; | |
| String data = realloc(this->data, sizeof(char) * this->capacity); | |
| if (data == NULL) { | |
| fprintf(stderr, "buffer alloc memory failed\n"); | |
| exit(EXIT_FAILURE); | |
| } | |
| this->data = data; | |
| } | |
| this->data[this->length] = value; | |
| this->length++; | |
| } | |
| void buffer_push(Buffer this, char value) { | |
| if (!this->escaped && strchr(",\"\r\n", value) != NULL) { | |
| this->escaped = true; | |
| } | |
| buffer_append(this, value); | |
| if (value == '"') { | |
| buffer_append(this, value); | |
| } | |
| } | |
| void buffer_push_wchar(Buffer this, int codepoint) { | |
| if (codepoint <= 0x7F) { // 1字节UTF-8 | |
| buffer_push(this, (char)codepoint); | |
| } else if (codepoint <= 0x7FF) { // 2字节UTF-8 | |
| buffer_push(this, 0xC0 | (codepoint >> 6)); | |
| buffer_push(this, 0x80 | (codepoint & 0x3F)); | |
| } else if (codepoint <= 0xFFFF) { // 3字节UTF-8 | |
| buffer_push(this, 0xE0 | (codepoint >> 12)); | |
| buffer_push(this, 0x80 | ((codepoint >> 6) & 0x3F)); | |
| buffer_push(this, 0x80 | (codepoint & 0x3F)); | |
| } else if (codepoint <= 0x10FFFF) { // 4字节UTF-8 | |
| buffer_push(this, 0xF0 | (codepoint >> 18)); | |
| buffer_push(this, 0x80 | ((codepoint >> 12) & 0x3F)); | |
| buffer_push(this, 0x80 | ((codepoint >> 6) & 0x3F)); | |
| buffer_push(this, 0x80 | (codepoint & 0x3F)); | |
| } | |
| } | |
| void buffer_clear(Buffer this) { | |
| this->length = 1; | |
| this->escaped = false; | |
| } | |
/* Range */

/*
 * Rectangular cell range. All bounds are zero-based and inclusive.
 */
struct _Range {
    int row_lower_bound; /* first row */
    int row_upper_bound; /* last row */
    int col_lower_bound; /* first column */
    int col_upper_bound; /* last column */
};
typedef struct _Range *Range;
| Range range_new(String range) { | |
| Range this = (Range)calloc(sizeof(struct _Range), 1); | |
| int offset = 0; | |
| int length = take_with(range + offset, isupper); | |
| this->col_lower_bound = col_to_index(range + offset, length); | |
| offset += length; | |
| length = take_with(range + offset, isdigit); | |
| this->row_lower_bound = row_to_index(range + offset, length); | |
| offset += length; | |
| if (range[offset] == ':') { | |
| offset++; | |
| } | |
| length = take_with(range + offset, isupper); | |
| this->col_upper_bound = col_to_index(range + offset, length); | |
| offset += length; | |
| length = take_with(range + offset, isdigit); | |
| this->row_upper_bound = row_to_index(range + offset, length); | |
| offset += length; | |
| if (range[offset] | |
| || this->col_lower_bound < 0 | |
| || this->row_lower_bound < 0 | |
| || this->col_upper_bound < 0 | |
| || this->row_upper_bound < 0 | |
| ) { | |
| fprintf(stderr, "parse range '%s' to '<COL><ROW>:<COL><ROW>' failed\n", range); | |
| exit(EXIT_FAILURE); | |
| } | |
| return this; | |
| } | |
| void range_delete(Range this) { | |
| free(this); | |
| } | |
| char range_delimiter(Range this, int col) { | |
| return this->col_upper_bound == col ? '\n' : ','; | |
| } | |
| /* Reader */ | |
| typedef int Descriptor; | |
| typedef size_t Size; | |
| typedef struct _Reader { | |
| Descriptor file; | |
| Size size; | |
| String content; | |
| } *Reader; | |
| Reader reader_new(String file_name) { | |
| Reader this = (Reader)calloc(sizeof(struct _Reader), 1); | |
| // 打开文件句柄。 | |
| this->file = open(file_name, O_RDONLY); | |
| if (this->file < 0) { | |
| fprintf(stderr, "open %s failed\n", file_name); | |
| exit(EXIT_FAILURE); | |
| } | |
| // 统计文件的字符数。 | |
| struct stat status; | |
| if (fstat(this->file, &status) < 0) { | |
| close(this->file); | |
| fprintf(stderr, "count %s failed\n", file_name); | |
| exit(EXIT_FAILURE); | |
| } | |
| this->size = status.st_size; | |
| // 把文件内容映射到内存。 | |
| this->content = mmap(NULL, this->size, PROT_READ, MAP_PRIVATE, this->file, 0); | |
| if (this->content == MAP_FAILED) { | |
| close(this->file); | |
| fprintf(stderr, "mmap %s failed\n", file_name); | |
| exit(EXIT_FAILURE); | |
| } | |
| return this; | |
| } | |
| void reader_delete(Reader this) { | |
| if (this->content != NULL) { | |
| munmap(this->content, this->size); | |
| } | |
| if (this->file > 0) { | |
| close(this->file); | |
| } | |
| free(this); | |
| } | |
| /* Writer */ | |
| typedef FILE *Writer; | |
| Writer writer_new(String file_name) { | |
| Writer this = fopen(file_name, "w"); | |
| if (this == NULL) { | |
| fprintf(stderr, "open %s failed\n", file_name); | |
| exit(EXIT_FAILURE); | |
| } | |
| return this; | |
| } | |
| void writer_delete(Writer this) { | |
| fclose(this); | |
| } | |
| void writer_write_string(Writer this, String data, int length) { | |
| if (length > 0) { | |
| fwrite(data, sizeof(char), length, this); | |
| } | |
| } | |
| void writer_write_buffer(Writer this, Buffer buffer, char delimiter) { | |
| if (buffer->escaped) { | |
| buffer_append(buffer, '"'); | |
| } | |
| buffer_append(buffer, delimiter); | |
| if (buffer->escaped) { | |
| writer_write_string(this, buffer->data, buffer->length); | |
| } else { | |
| writer_write_string(this, buffer->data + 1, buffer->length - 1); | |
| } | |
| buffer_clear(buffer); | |
| } | |
| /* Transform */ | |
| String read_text(String pointer, String delimiter, Buffer buffer) { | |
| size_t length = strlen(delimiter); | |
| bool ignore = false; | |
| char value = 0; | |
| while ((value = *pointer) != 0) { | |
| if (!strncmp(pointer, delimiter, length)) { | |
| pointer += length; | |
| break; | |
| } else if (value == '<') { | |
| ignore = true; | |
| } else if (value == '>') { | |
| ignore = false; | |
| } else if (!ignore) { | |
| if (!strncmp(pointer, "<", 4)) { | |
| buffer_push(buffer, '<'); | |
| pointer += 3; | |
| } else if (!strncmp(pointer, ">", 4)) { | |
| buffer_push(buffer, '>'); | |
| pointer += 3; | |
| } else if (!strncmp(pointer, "&", 5)) { | |
| buffer_push(buffer, '&'); | |
| pointer += 4; | |
| } else if (!strncmp(pointer, "'", 5)) { | |
| buffer_push(buffer, '\''); | |
| pointer += 4; | |
| } else if (!strncmp(pointer, """, 5)) { | |
| buffer_push(buffer, '"'); | |
| pointer += 4; | |
| } else if (value == '#') { | |
| pointer++; | |
| String end = strchr(pointer, ';'); | |
| if (*pointer == 'x' || *pointer == 'X') { | |
| pointer++; | |
| int codepoint = string_to_hex(pointer, end - pointer); | |
| buffer_push_wchar(buffer, codepoint); | |
| } else { | |
| int codepoint = string_to_dec(pointer, end - pointer); | |
| buffer_push_wchar(buffer, codepoint); | |
| } | |
| pointer = end; | |
| } else { | |
| buffer_push(buffer, value); | |
| } | |
| } | |
| pointer++; | |
| } | |
| return pointer; | |
| } | |
| void transform(Reader reader, Writer writer, Range range) { | |
| Buffer buffer = buffer_new(); | |
| int row = range->row_lower_bound; | |
| String pointer = reader->content; | |
| while ((pointer = strstr(pointer, "<row ")) != NULL) { | |
| pointer = strstr(pointer, " r=\"") + 4; | |
| int row_index = row_to_index(pointer, take_with(pointer, isdigit)); | |
| if (row_index < range->row_lower_bound) { // 未到起始行 | |
| continue; | |
| } | |
| int row_upper_chunk = min(row_index, range->row_upper_bound + 1); | |
| if (row < row_upper_chunk) { | |
| buffer_clear(buffer); | |
| for (int col = range->col_lower_bound; col <= range->col_upper_bound; col++) { | |
| buffer_push(buffer, range_delimiter(range, col)); | |
| } | |
| while (row < row_upper_chunk) { // 填充空行 | |
| writer_write_string(writer, buffer->data + 1, buffer->length - 1); | |
| row++; | |
| } | |
| } | |
| if (row_index > range->row_upper_bound) { | |
| break; | |
| } | |
| int col = range->col_lower_bound; | |
| int col_index = -1; | |
| bool contains = false; | |
| while ((pointer = strchr(pointer, '<')) != NULL) { | |
| if (!strncmp(pointer, "</row>", 6)) { // 本行结束 | |
| pointer += 6; | |
| if (col <= range->col_upper_bound) { | |
| buffer_clear(buffer); | |
| while (col <= range->col_upper_bound) { // 填充空列 | |
| buffer_push(buffer, range_delimiter(range, col)); | |
| col++; | |
| } | |
| writer_write_string(writer, buffer->data + 1, buffer->length - 1); | |
| } | |
| row++; | |
| break; | |
| } else if (contains && !strncmp(pointer, "</c> ", 4)) { // 本列结束 | |
| pointer += 4; | |
| writer_write_buffer(writer, buffer, range_delimiter(range, col)); | |
| col++; | |
| } else if (!strncmp(pointer, "<c ", 3)) { | |
| pointer = strstr(pointer, " r=\"") + 4; | |
| col_index = col_to_index(pointer, take_with(pointer, isupper)); | |
| pointer = strchr(pointer, '>'); | |
| int col_upper_chunk = min(col_index, range->col_upper_bound + 1); | |
| if (col < col_upper_chunk) { | |
| buffer_clear(buffer); | |
| while (col < col_upper_chunk) { // 填充空列 | |
| buffer_push(buffer, range_delimiter(range, col)); | |
| col++; | |
| } | |
| writer_write_string(writer, buffer->data + 1, buffer->length - 1); | |
| } | |
| buffer_clear(buffer); | |
| contains = range->col_lower_bound <= col_index && col_index <= range->col_upper_bound; | |
| } else if (contains && !strncmp(pointer, "<v>", 3)) { | |
| pointer = read_text(pointer, "</v>", buffer); | |
| } else if (contains && !strncmp(pointer, "<is>", 4)) { | |
| pointer = read_text(pointer, "</is>", buffer); | |
| } else { | |
| pointer = strchr(pointer, '>'); | |
| } | |
| } | |
| } | |
| buffer_delete(buffer); | |
| } | |
| /* Main */ | |
| int main(int argc, char* argv[]) { | |
| String program = argv[0]; | |
| Reader reader = NULL; | |
| Writer writer = NULL; | |
| Range range = NULL; | |
| if (argc == 4) { | |
| reader = reader_new(argv[1]); | |
| range = range_new(argv[2]); | |
| writer = writer_new(argv[3]); | |
| } else { | |
| help(program); | |
| } | |
| transform(reader, writer, range); | |
| range_delete(range); | |
| reader_delete(reader); | |
| writer_delete(writer); | |
| return EXIT_SUCCESS; | |
| } | |
| /* Helpers */ | |
| void help(String program) { | |
| fprintf(stderr, "Usage: %s <XML> <RANGE> <CSV>\n\n", program); | |
| fprintf(stderr, "PARAMETERS:\n\n"); | |
| fprintf(stderr, " XML Input xml file name (required).\n"); | |
| fprintf(stderr, " RANGE Data range (required).\n"); | |
| fprintf(stderr, " CSV Output csv file name (required).\n"); | |
| fprintf(stderr, "\nAuthor: Zhang, Zepeng <[email protected]>\n"); | |
| exit(EXIT_FAILURE); | |
| } | |
/*
 * Return the smaller of `a` and `b`.
 */
int min(int a, int b) {
    if (b < a) {
        return b;
    }
    return a;
}
/*
 * Return the larger of `a` and `b`.
 */
int max(int a, int b) {
    if (b > a) {
        return b;
    }
    return a;
}
/*
 * Count the leading characters of `s` that satisfy `predicate`.
 *
 * s:         NUL-terminated text; NULL is treated as empty.
 * predicate: classifier with the <ctype.h> signature (isdigit, isupper, ...).
 *
 * Each character is converted to unsigned char before it is classified:
 * the <ctype.h> functions are only defined for EOF and unsigned char
 * values, and bytes of multi-byte UTF-8 text are negative where plain char
 * is signed.
 *
 * Returns the length of the matching prefix (0 if there is none).
 */
int take_with(char *s, int (*predicate)(int)) {
    int index = 0;
    while (s != NULL && s[index] != 0 && predicate((unsigned char)s[index])) {
        index++;
    }
    return index;
}
/*
 * Convert the first `length` characters of a column name ("A", "Z", "AA",
 * ...) into a zero-based column index.
 *
 * Returns -1 when `s` is NULL, when `length` is not positive, when a
 * character outside 'A'..'Z' is met, or when the index does not fit in an
 * int.
 */
int col_to_index(char *s, int length) {
    int column = 0;
    for (int index = 0; s != NULL && index < length; index++) {
        int digit = s[index] - 'A' + 1; // 'A' counts as 1 in this base-26 notation
        if (digit < 1 || digit > 26) {
            return -1;
        }
        if (column > (INT_MAX - digit) / 26) { // the next step would overflow
            return -1;
        }
        column = column * 26 + digit;
    }
    return column - 1;
}
| int row_to_index(String s, int length) { | |
| return string_to_dec(s, length) - 1; | |
| } | |
/*
 * Parse the first `length` characters of `s` as an unsigned decimal number.
 *
 * Parsing stops at the first character that is not a decimal digit.
 * Returns the parsed value, 0 when `s` is NULL or no digit is present, and
 * INT_MAX when the value does not fit in an int.
 */
int string_to_dec(char *s, int length) {
    int value = 0;
    for (int index = 0; s != NULL && index < length; index++) {
        int digit = s[index] - '0';
        if (digit < 0 || digit > 9) {
            break;
        }
        if (value > (INT_MAX - digit) / 10) { // the next step would overflow
            return INT_MAX;
        }
        value = value * 10 + digit;
    }
    return value;
}
/*
 * Parse the first `length` characters of `s` as an unsigned hexadecimal
 * number. Both upper-case and lower-case digits are accepted.
 *
 * Parsing stops at the first character that is not a hexadecimal digit.
 * Returns the parsed value, 0 when `s` is NULL or no digit is present, and
 * INT_MAX when the value does not fit in an int.
 */
int string_to_hex(char *s, int length) {
    int value = 0;
    for (int index = 0; s != NULL && index < length; index++) {
        char c = s[index];
        int digit;
        if (c >= '0' && c <= '9') {
            digit = c - '0';
        } else if (c >= 'A' && c <= 'F') {
            digit = c - 'A' + 10;
        } else if (c >= 'a' && c <= 'f') {
            digit = c - 'a' + 10;
        } else {
            break;
        }
        if (value > (INT_MAX - digit) / 16) { // the next step would overflow
            return INT_MAX;
        }
        value = value * 16 + digit;
    }
    return value;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment