Last active
January 23, 2019 18:06
-
-
Save xigh/0a8f520010322a6bcd3a0a96732f087c to your computer and use it in GitHub Desktop.
Retrieve OpenOffice documents from a formatted hard drive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1- calls mmap on the disk device
2- looks for zip headers
3- retrieves the beginning of the zip file
4- checks that it contains a mimetype file with the expected mimetype
5- saves the result to a file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _GNU_SOURCE | |
#include <sys/mman.h> | |
#include <sys/types.h> | |
#include <fcntl.h> | |
#include <unistd.h> | |
#include <sys/stat.h> | |
#include <assert.h> | |
#include <stdio.h> | |
#include <errno.h> | |
#include <string.h> | |
#include <ctype.h> | |
#include <sys/ioctl.h> | |
#include <linux/fs.h> | |
#include <stdint.h> | |
#include <stdlib.h> | |
/* Raw octet, as read from the mapped device. */
typedef unsigned char byte;

/*
 * Print a hex dump of ptr[off] .. ptr[len - 1] to stdout.
 *
 * Every row shows `indent`, the absolute offset of its first byte, up to
 * 16 bytes in hex (a short last row is padded with "--") and the same bytes
 * as characters, with non-printable ones shown as '-'.
 *
 *   ptr    - base of the buffer
 *   off    - index of the first byte to show
 *   len    - index one past the last byte to show
 *   lines  - maximum number of rows; at least one row is always printed
 *   indent - prefix written at the start of every row
 */
void dump(byte *ptr, int64_t off, int64_t len, int lines, char *indent)
{
    const int64_t total = len - off;
    int64_t row_start = 0;
    int64_t rows = 0;

    do {
        int64_t row_len = total - row_start;
        int64_t k;

        if (row_len > 16) {
            row_len = 16;
        }

        /* An empty range prints only the separator line, with no offset. */
        if (row_len > 0) {
            printf("%s%016llx: ", indent, (unsigned long long) (off + row_start));
            for (k = 0; k < row_len; k++) {
                printf("%02x ", ptr[off + row_start + k]);
            }
            for (; k < 16; k++) {
                printf("-- ");
            }
        }

        /* The character column is printed for every row, including a final
         * row that is exactly 16 bytes long. */
        printf(" ");
        for (k = 0; k < row_len; k++) {
            byte c = ptr[off + row_start + k];

            printf("%c ", isprint(c) ? c : '-');
        }
        printf("\n");

        row_start += 16;
        rows++;
    } while (row_start < total && rows < lines);
}
/* "PK\5\6": the signature main() scans the device for; savezip() parses the
 * bytes that follow it as a ZIP end-of-central-directory record. */
byte pkzip[] = { 'P', 'K', 0x05, 0x06 };

/* "PK\3\4": the signature savezip() requires at the file offset recorded by
 * every central-directory entry. */
byte local[] = { 'P', 'K', 0x03, 0x04 };

/* Contents of the "mimetype" member that savezip() recognises, each paired
 * with its length excluding the terminating NUL. */
char odt[] = "application/vnd.oasis.opendocument.text";
int odts = (int) (sizeof odt - 1);

char ods[] = "application/vnd.oasis.opendocument.spreadsheet";
int odss = (int) (sizeof ods - 1);

char odp[] = "application/vnd.oasis.opendocument.presentation";
int odps = (int) (sizeof odp - 1);

/* Unit suffixes used by main() when printing the device size, one per
 * division by 1024. */
char *ext[] = {
    "B",
    "kB",
    "MB",
    "GB",
    "TB"
};
/* Read the little-endian 16-bit value stored in data[off] and data[off + 1]. */
uint16_t getUint16(byte *data, int64_t off)
{
    const byte *p = data + off;

    /* The two bytes occupy disjoint bit ranges, so OR-ing them combines them. */
    return (uint16_t) (p[0] | (p[1] << 8));
}
/* Read the little-endian 32-bit value stored in data[off] .. data[off + 3],
 * assembled from two 16-bit halves. */
uint32_t getUint32(byte *data, int64_t off)
{
    uint32_t value = getUint16(data, off + 2);   /* upper half first */

    value <<= 16;
    value |= getUint16(data, off);               /* then the lower half */
    return value;
}
/* Write all `count` bytes of buf to fd, retrying after short writes and
 * EINTR. Returns 0 on success and -1 on failure. */
static int write_all(int fd, const byte *buf, int64_t count)
{
    while (count > 0) {
        /* Cap each request so the length always fits in size_t/ssize_t. */
        size_t chunk = count > 0x40000000 ? (size_t) 0x40000000 : (size_t) count;
        ssize_t done = write(fd, buf, chunk);

        if (done < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        if (done == 0) {
            return -1;   /* no progress: give up instead of spinning */
        }
        buf += done;
        count -= done;
    }
    return 0;
}

/* Return nonzero when stored_len equals want_len and the want_len bytes at
 * data[at] lie inside data[0 .. len) and are equal to `want`. */
static int stored_bytes_equal(const byte *data, int64_t len, int64_t at,
                              uint32_t stored_len, const char *want, int want_len)
{
    if (stored_len != (uint32_t) want_len) {
        return 0;
    }
    if (at < 0 || at > len - want_len) {
        return 0;
    }
    return memcmp(data + at, want, (size_t) want_len) == 0;
}

/*
 * Try to recover the ZIP archive whose end-of-central-directory (EOCD)
 * record starts at data[off], and save it to disk.
 *
 * The central directory is assumed to end right before the EOCD record, and
 * the archive to start `directory offset` bytes before the directory. Every
 * directory entry is checked against the local file header it points to.
 * When all entries are valid and their count matches the EOCD record, the
 * archive is written to "tmp2/file_NNNNNN.<odt|ods|odp>" if it holds a
 * stored "mimetype" member with a known OpenDocument type and a
 * "content.xml" member, and to "tmp/file_NNNNNN.zip" otherwise. Neither
 * directory is created here.
 *
 *   data - start of the mapped device
 *   off  - offset of the "PK\5\6" signature inside data
 *   len  - number of valid bytes in data; nothing outside data[0 .. len)
 *          is read
 *   idx  - number used to build the output file name
 *
 * Returns 0 only when a file was written completely, nonzero otherwise.
 */
int savezip(byte *data, int64_t off, int64_t len, int idx)
{
    enum { EOCD_SIZE = 22, CDIR_ENTRY_SIZE = 46, LOCAL_HEADER_SIZE = 30 };

    int64_t head, size, zip_start, pos, total;
    uint16_t nrecs;
    uint32_t recs = 0;
    const char *suffix = NULL;
    int hasMimetype = 0;
    int hasContent = 0;
    int err = 0;
    char *name = NULL;
    int made;

    /* The fixed part of the EOCD record must lie inside the mapping. */
    if (data == NULL || off < 0 || off > len - EOCD_SIZE) {
        printf(" ** bad header **\n\n");
        return 1;
    }

    nrecs = getUint16(data, off + 10);   /* total number of entries */
    size = getUint32(data, off + 12);    /* central directory size */
    head = getUint32(data, off + 16);    /* directory offset in the archive */
    zip_start = off - size - head;

    /* Only single-disk archives (disk number and directory-start disk both
     * zero) that start inside the mapping are accepted. */
    if (getUint16(data, off + 4) != 0 || getUint16(data, off + 6) != 0 || zip_start < 0) {
        printf(" ** bad header **\n\n");
        return 1;
    }

    pos = off - size;
    while (err == 0 && pos < off) {
        uint16_t method, name_len, extra_len, comment_len;
        uint32_t csize, usize, local_off;
        int64_t lh;

        if (off - pos < CDIR_ENTRY_SIZE) {
            /* Truncated entry: its fields cannot be read safely. */
            printf("\t ** invalid head\n");
            err = 1;
            break;
        }

        method = getUint16(data, pos + 10);
        csize = getUint32(data, pos + 20);
        usize = getUint32(data, pos + 24);
        name_len = getUint16(data, pos + 28);
        extra_len = getUint16(data, pos + 30);
        comment_len = getUint16(data, pos + 32);
        local_off = getUint32(data, pos + 42);

        lh = zip_start + local_off;
        if (lh > len - LOCAL_HEADER_SIZE || memcmp(data + lh, local, sizeof local) != 0) {
            printf("\t ** invalid head\n");
            err = 1;
        } else {
            uint16_t n2 = getUint16(data, lh + 26);
            uint16_t m2 = getUint16(data, lh + 28);
            int64_t name_at = lh + LOCAL_HEADER_SIZE;

            if (name_len != n2 || name_at > len - n2) {
                printf("\t ** invalid head [2]\n");
                err = 1;
            } else {
                char *path = strndup((const char *) (data + name_at), n2);

                if (path == NULL) {
                    printf("\t ** could not allocate memory for path\n");
                    err = 1;
                } else {
                    if (strcmp(path, "mimetype") == 0) {
                        hasMimetype = 1;
                        /* The type is only readable when the member is
                         * stored uncompressed. */
                        if (csize == usize && method == 0) {
                            int64_t body = name_at + n2 + m2;

                            if (stored_bytes_equal(data, len, body, csize, odt, odts)) {
                                suffix = "odt";
                            } else if (stored_bytes_equal(data, len, body, csize, ods, odss)) {
                                suffix = "ods";
                            } else if (stored_bytes_equal(data, len, body, csize, odp, odps)) {
                                suffix = "odp";
                            }
                        }
                    }
                    if (strcmp(path, "content.xml") == 0) {
                        hasContent = 1;
                    }
                    printf("\t%08x: %s [%s] (%u -> %u) [%d]\n", local_off, path,
                           suffix ? suffix : "", csize, usize, method);
                    free(path);
                }
            }
        }

        recs += 1;
        /* Name, extra field and file comment all follow the fixed part. */
        pos += CDIR_ENTRY_SIZE + name_len + extra_len + comment_len;
    }

    printf("\trecs=%u/%u, zip at %016llx\n", (unsigned) recs, (unsigned) nrecs,
           (unsigned long long) zip_start);

    if (err != 0 || recs != nrecs) {
        printf("\n");
        return 1;
    }

    /* Archive = local files + central directory + EOCD record with its
     * trailing comment, clamped to what the mapping actually contains. */
    total = head + size + EOCD_SIZE + getUint16(data, off + 20);
    if (total > len - zip_start) {
        total = len - zip_start;
    }

    if (hasMimetype && hasContent && suffix != NULL) {
        made = asprintf(&name, "tmp2/file_%06d.%s", idx, suffix);
    } else {
        made = asprintf(&name, "tmp/file_%06d.zip", idx);
    }

    err = 1;   /* stays set until the file is known to be complete */
    if (made < 0) {
        printf("## could not allocate memory for path\n");
    } else {
        int fd = open(name, O_WRONLY | O_CREAT | O_TRUNC, 0664);

        if (fd < 0) {
            printf("## failed to open %s\n", name);
        } else {
            int wrote = write_all(fd, data + zip_start, total);
            int closed = close(fd);

            if (wrote != 0 || closed != 0) {
                printf("## failed to save %s\n", name);
            } else {
                printf(" => file saved to %s\n", name);
                err = 0;
            }
        }
        free(name);
    }

    printf("\n");
    return err;
}
int main(int argc, char **argv) | |
{ | |
int z, n, fd, err; | |
struct stat st; | |
byte *data, *pk; | |
int64_t off, sz; | |
err = stat(argv[1], &st); | |
if (err == -1) { | |
printf("stat failed with: %s\n", strerror(errno)); | |
return 0; | |
} | |
fd = open(argv[1], O_RDONLY); | |
if (fd == -1) { | |
printf("open failed with: %s\n", strerror(errno)); | |
return 0; | |
} | |
if (st.st_size == 0) { | |
err = ioctl(fd, BLKGETSIZE64, &st.st_size); | |
if (err == -1) { | |
printf("ioctl failed with: %s\n", strerror(errno)); | |
return 0; | |
} | |
} | |
n = 0; | |
sz = st.st_size; | |
while (sz > 1024ull) { | |
sz /= 1024ull; | |
n += 1; | |
} | |
printf("device size: %zd (%zd%s)\n\n", st.st_size, sz, ext[n]); | |
// dump(pkzip, 0, sizeof pkzip, 1); | |
// printf("\n"); | |
data = (byte *) mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); | |
if (data == MAP_FAILED) { | |
printf("mmap failed with: %s\n", strerror(errno)); | |
return 0; | |
} | |
z = 0; | |
pk = data+0; | |
for (n = 0; n < 20000; n++) { | |
off = pk-data; | |
pk = memmem(pk, st.st_size-off, pkzip, sizeof(pkzip)); | |
if (pk == 0) { | |
break; | |
} | |
off = pk-data; | |
printf("%d: found PK at offset: %zd (%.1f%%)\n", n + 1, off, (100.0 * (float) off) / (float) st.st_size); | |
if (0 == savezip(data, off, st.st_size, z)) { | |
z += 1; | |
} | |
if (z > 2000) { | |
break; | |
} | |
pk += 4; | |
} | |
printf("found %d zip files\n", z); | |
err = munmap(data, st.st_size); | |
if (err != 0) { | |
printf("munmap failed with: %s\n", strerror(errno)); | |
return 0; | |
} | |
close(fd); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment