Created
September 30, 2019 07:30
-
-
Save sergey-tihon/6d0aba1c8653cdd0e6f95b67121d859b to your computer and use it in GitHub Desktop.
Fast file parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
the -0.071549 0.093459 0.023738 -0.090339 0.056123 0.32547 -0.39796 -0.092139 0.061181 -0.1895 0.13061 0.14349 0.011479 0.38158 0.5403 -0.14088 0.24315 0.23036 -0.55339 0.048154 0.45662 3.2338 0.020199 0.049019 -0.014132 0.076017 -0.11527 0.2006 -0.077657 0.24328 0.16368 -0.34118 -0.06607 0.10152 0.038232 -0.17668 -0.88153 -0.33895 -0.035481 -0.55095 -0.016899 -0.43982 0.039004 0.40447 -0.2588 0.64594 0.26641 0.28009 -0.024625 0.63302 -0.317 0.10271 0.30886 0.097792 -0.38227 0.086552 0.047075 0.23511 -0.32127 -0.28538 0.1667 -0.0049707 -0.62714 -0.24904 0.29713 0.14379 -0.12325 -0.058178 -0.001029 -0.082126 0.36935 -0.00058442 0.34286 0.28426 -0.068599 0.65747 -0.029087 0.16184 0.073672 -0.30343 0.095733 -0.5286 -0.22898 0.064079 0.015218 0.34921 -0.4396 -0.43983 0.77515 -0.87767 -0.087504 0.39598 0.62362 -0.26211 -0.30539 -0.022964 0.30567 0.06766 0.15383 -0.11211 -0.09154 0.082562 0.16897 -0.032952 -0.28775 -0.2232 -0.090426 1.2407 -0.18244 -0.0075219 -0.041388 -0.011083 0.078186 0.38511 0.23334 0.14414 -0.0009107 -0.26388 -0.20481 0.10099 0.14076 0.28834 -0.045429 0.37247 0.13645 -0.67457 0.22786 0.12599 0.029091 0.030428 -0.13028 0.19408 0.49014 -0.39121 -0.075952 0.074731 0.18902 -0.16922 -0.26019 -0.039771 -0.24153 0.10875 0.30434 0.036009 1.4264 0.12759 -0.073811 -0.20418 0.0080016 0.15381 0.20223 0.28274 0.096206 -0.33634 0.50983 0.32625 -0.26535 0.374 -0.30388 -0.40033 -0.04291 -0.067897 -0.29332 0.10978 -0.045365 0.23222 -0.31134 -0.28983 -0.66687 0.53097 0.19461 0.3667 0.26185 -0.65187 0.10266 0.11363 -0.12953 -0.68246 -0.18751 0.1476 1.0765 -0.22908 -0.0093435 -0.20651 -0.35225 -0.2672 -0.0034307 0.25906 0.21759 0.66158 0.1218 0.19957 -0.20303 0.34474 -0.24328 0.13139 -0.0088767 0.33617 0.030591 0.25577 | |
, 0.17651 0.29208 -0.0020768 -0.37523 0.0049139 0.23979 -0.28893 -0.014643 -0.10993 0.15592 0.20627 0.47675 0.099907 -0.14058 0.21114 0.12126 -0.31831 -0.089433 -0.090553 -0.31962 0.21319 2.4844 -0.077521 -0.084279 0.20186 0.26084 -0.40411 -0.19127 0.24715 0.22394 -0.063437 0.20379 -0.18463 -0.088413 0.024169 -0.28769 -0.61246 -0.12683 -0.088273 0.18331 -0.53161 -0.1997 -0.26703 0.15312 -0.015239 -0.082844 0.47856 -0.29612 0.11168 -0.02579 -0.011697 0.19923 -0.14267 0.6625 -0.051739 -0.16938 -0.15635 0.092806 0.32548 0.11724 0.28788 -0.060651 -0.14153 0.16668 0.26861 -0.031001 -0.39665 0.35304 0.2385 0.12388 0.45698 -0.12559 -0.12804 0.37449 0.2446 0.23073 0.20808 0.051258 -0.21816 -0.036409 -0.0388 -0.042487 -0.30779 -0.025449 0.22532 0.045538 -0.48934 -0.13988 0.17394 -0.46137 -0.26555 0.15473 0.063816 -0.17022 -0.15762 0.075765 0.12151 -0.4934 -0.10909 0.034487 0.29947 0.01869 -0.16534 0.016679 0.16341 -0.27418 0.077797 1.4023 0.025275 0.094725 -0.040735 -0.10642 0.023364 0.079143 -0.16615 -0.23013 -0.14071 0.40159 -0.34951 0.018545 0.22434 0.76922 0.24722 0.14936 0.42368 -0.72059 -0.038541 0.15522 0.33596 -0.43077 -0.026925 -0.37733 0.24271 -0.46495 0.45783 0.23693 0.079361 -0.32244 -0.42434 -0.11138 0.55426 0.085153 -0.020581 -0.046386 1.2467 0.13177 0.067092 -0.5778 0.013586 -0.071274 0.017311 0.089781 0.19857 -0.032205 0.64843 -0.23797 -0.19676 0.20203 0.21074 -0.50347 0.026823 -0.045444 -0.22642 -0.19977 -0.12138 0.16941 0.061998 0.42631 -0.088383 0.45756 0.077774 0.061342 0.4571 -0.17787 -0.14597 0.32654 0.002443 -0.11886 0.10081 -0.020011 1.0366 -0.39814 -0.6818 0.23685 -0.20396 -0.17668 -0.31385 0.14834 -0.052187 0.0613 -0.32582 0.19153 -0.15469 -0.14679 0.046971 0.032325 -0.22006 -0.20774 -0.23189 -0.10814 | |
. 0.12289 0.58037 -0.069635 -0.50288 0.10503 0.39945 -0.38635 -0.084279 0.12219 0.080312 0.32337 0.47579 -0.038375 -0.00709 0.41524 0.32121 -0.21185 0.36144 -0.055623 -0.030512 0.42854 2.8547 -0.14623 -0.17557 0.31197 -0.13118 0.033298 0.13093 0.089889 -0.12417 0.0023396 -0.068954 -0.10754 -0.11551 -0.31052 -0.12097 -0.46691 -0.0836 -0.037664 -0.071779 -0.11899 -0.20381 -0.12424 0.46339 -0.19828 -0.0080365 0.53718 0.031739 0.34331 0.0079704 0.0048744 0.030592 -0.17615 0.82342 -0.13793 -0.10075 -0.12686 0.074735 -0.088719 -0.042719 0.076624 0.089263 0.064445 -0.031958 0.15254 -0.10384 0.076604 0.34099 0.24331 -0.10452 0.40714 -0.1826 -0.040667 0.50878 0.08076 0.22759 -0.042162 -0.18171 -0.095025 0.030334 0.088202 -3.9843e-06 -0.0039877 0.15724 0.33167 0.08471 -0.25919 -0.41384 0.2992 -0.54255 0.032129 0.1003 0.44202 0.044682 -0.090681 -0.10481 -0.1186 -0.31972 -0.2079 -0.040203 -0.022988 0.22824 0.0055238 0.12568 -0.1464 -0.14904 -0.11561 1.0517 -0.19498 0.083958 0.044812 -0.12965 -0.093468 0.21237 -0.088332 -0.1868 0.26521 0.13097 -0.048102 -0.22467 0.28412 0.34907 0.34833 0.017877 0.30504 -0.83453 0.048856 -0.1933 0.20764 -0.49701 -0.18747 -0.076801 0.15558 -0.46844 0.40944 0.21386 0.082392 -0.26491 -0.21224 -0.13293 0.14738 -0.14192 0.18994 -0.15587 1.0738 0.40789 -0.27452 -0.18431 0.00068679 -0.087115 0.19672 0.40918 -0.35462 -0.06326 0.4492 -0.060568 -0.041636 0.20531 0.017025 -0.58448 0.075441 0.082116 -0.46008 0.012393 -0.02531 0.14177 -0.092192 0.34505 -0.52136 0.57304 0.011973 0.033196 0.29672 -0.27899 0.19979 0.25666 0.082079 -0.078436 0.093719 0.24202 1.3495 -0.30434 -0.30936 0.42047 -0.079068 -0.14819 -0.089404 0.0668 0.22405 0.27226 -0.035236 0.17688 -0.0536 0.0070031 -0.033006 -0.080021 -0.24451 -0.039174 -0.16236 -0.096652 | |
of 0.052924 0.25427 0.31353 -0.35613 0.029629 0.51034 -0.10716 0.15195 0.057698 0.06149 0.06116 0.39911 -0.00029018 0.31978 0.43257 -0.14708 0.054842 0.27079 -0.14051 -0.30101 0.16313 3.0013 0.22231 -0.14279 0.083705 0.089866 -0.52706 -0.089661 0.27311 0.31413 -0.04081 0.060557 -0.042656 0.24178 -0.29187 0.22575 -0.6298 -0.14641 -0.22429 -0.056621 -0.17776 -0.64269 0.51626 0.22305 0.12124 0.48074 0.41743 0.54805 0.40955 0.42407 0.049906 -0.32574 0.46298 0.19245 0.28143 0.2966 0.063593 -0.11906 -0.15016 -0.04984 0.40675 0.010675 -0.69127 0.048729 0.26391 0.30961 -0.11921 0.25548 -0.28219 -0.037413 0.36461 0.027129 0.20786 0.53325 0.50148 0.72381 0.065292 -0.078716 -0.10537 -0.08081 -0.2096 0.040902 -0.88101 0.24715 0.16146 0.10361 0.19705 -0.27365 0.89902 -0.29981 0.036165 0.041238 0.60105 -0.18911 -0.43887 -0.14097 0.44073 -0.19999 0.28834 -0.25458 -0.10985 -0.0027379 0.091735 0.17021 -0.16305 -0.57439 0.37063 1.7262 -0.24656 0.51681 -0.15355 -0.15553 0.019783 0.1803 0.38178 0.094443 -0.55158 -0.20242 -0.4386 -0.42108 0.27525 0.58977 0.026655 0.16401 0.13893 -0.68692 0.51071 0.29278 0.022041 -0.18156 -0.64905 0.16923 -0.01059 0.21785 -0.27242 0.27967 0.1395 -0.70559 -0.26034 -0.44017 0.15303 0.19693 -0.096838 0.14827 1.1294 -0.31267 0.0099916 -0.48623 0.080584 0.35608 -0.19925 0.19306 -0.2004 -0.44194 0.75766 0.24487 -0.18903 0.26653 -0.21339 -0.54083 0.40532 -0.02796 -0.13398 -0.11086 0.059506 0.24052 -0.59739 -0.0024069 -0.18593 1.042 -0.12969 0.20813 0.33305 -0.1278 0.085662 -0.076422 0.31407 -0.23784 -0.054838 0.011369 0.845 -0.34165 0.093983 0.082445 -0.27777 -0.44226 -0.063078 0.37274 0.054468 0.24197 -0.040886 0.3894 -0.10509 0.23372 0.096027 -0.30324 0.24488 -0.086254 -0.41917 0.46496 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public class DataLoader | |
{ | |
public void Load(string filePath, int wordVectorLength, int capacity = 1_200_000) | |
{ | |
_wordVectorLength = wordVectorLength; | |
_wordVectors = new float[capacity, wordVectorLength]; | |
_word2Index = new Dictionary<string, int>(capacity, StringComparer.InvariantCultureIgnoreCase); | |
var updateDict = new ActionBlock<(string word, int pos)>(x => | |
{ | |
if (_word2Index.ContainsKey(x.word)) | |
return; | |
_word2Index.Add(x.word, x.pos); | |
}); | |
var actionBlock = new ActionBlock<(string line, int pos)>(x => | |
{ | |
var ind = x.line.IndexOf(' '); | |
var word = x.line.Substring(0, ind); | |
var span = x.line.AsSpan(); | |
for (var i = 0; i < wordVectorLength; i++) | |
{ | |
while (span[ind] == ' ') ind++; | |
var pos = ind; | |
while (pos < span.Length && span[pos] != ' ') pos++; | |
var s = span.Slice(ind, pos - ind); | |
_wordVectors[x.pos, i] = float.Parse(s); | |
ind = pos; | |
} | |
updateDict.Post((word, x.pos)); | |
}, new ExecutionDataflowBlockOptions | |
{ | |
MaxDegreeOfParallelism = Environment.ProcessorCount | |
}); | |
using var fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read); | |
using var bufferedStream = new BufferedStream(fileStream); | |
using var streamReader = new StreamReader(bufferedStream, Encoding.UTF8); | |
string line; | |
var position = 0; | |
while ((line = streamReader.ReadLine()) != null) | |
actionBlock.Post((line, position++)); | |
actionBlock.Complete(); | |
actionBlock.Completion.GetAwaiter().GetResult(); | |
} | |
private int _wordVectorLength; | |
private float[,] _wordVectors; | |
private Dictionary<string, int> _word2Index; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment