Created
January 8, 2025 14:22
-
-
Save nan-wang/430c2b86f07675304d6c401a3cddfe6a to your computer and use it in GitHub Desktop.
inspect_into_modernbert_tokenizer.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyM32Pf1T/gYmX06lwSM61Wl", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| }, | |
| "widgets": { | |
| "application/vnd.jupyter.widget-state+json": { | |
| "f8d50d3263e94aa48d49b6df2c51d7e3": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HBoxModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HBoxView", | |
| "box_style": "", | |
| "children": [ | |
| "IPY_MODEL_bf53f11922ae42eb995fcb00170fb27f", | |
| "IPY_MODEL_c9d846ed39d4491cac3daef637e0a498", | |
| "IPY_MODEL_ace914fcaf864e16bef11a5f97ce626c" | |
| ], | |
| "layout": "IPY_MODEL_6b771547d7524857bbc0854bc62fa85f" | |
| } | |
| }, | |
| "bf53f11922ae42eb995fcb00170fb27f": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HTMLModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HTMLView", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_663d82b5194149848cd22895d024ea7c", | |
| "placeholder": "", | |
| "style": "IPY_MODEL_361f137127b34ea3986e42ba05926cb6", | |
| "value": "tokenizer_config.json: 100%" | |
| } | |
| }, | |
| "c9d846ed39d4491cac3daef637e0a498": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "FloatProgressModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "ProgressView", | |
| "bar_style": "success", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_760b75b6da344c1d83ea4362668a22bb", | |
| "max": 20837, | |
| "min": 0, | |
| "orientation": "horizontal", | |
| "style": "IPY_MODEL_e8d97ad24bb74bdf97bd2a21e17a2bb0", | |
| "value": 20837 | |
| } | |
| }, | |
| "ace914fcaf864e16bef11a5f97ce626c": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HTMLModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HTMLView", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_37085e6939734a5fa888b31f602b9b97", | |
| "placeholder": "", | |
| "style": "IPY_MODEL_40453e191280454eb70bce4b8250264c", | |
| "value": " 20.8k/20.8k [00:00<00:00, 1.29MB/s]" | |
| } | |
| }, | |
| "6b771547d7524857bbc0854bc62fa85f": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "663d82b5194149848cd22895d024ea7c": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "361f137127b34ea3986e42ba05926cb6": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "DescriptionStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "description_width": "" | |
| } | |
| }, | |
| "760b75b6da344c1d83ea4362668a22bb": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "e8d97ad24bb74bdf97bd2a21e17a2bb0": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "ProgressStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "bar_color": null, | |
| "description_width": "" | |
| } | |
| }, | |
| "37085e6939734a5fa888b31f602b9b97": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "40453e191280454eb70bce4b8250264c": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "DescriptionStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "description_width": "" | |
| } | |
| }, | |
| "5ea7b7bbb88c468e9ba06bd82c48c097": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HBoxModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HBoxView", | |
| "box_style": "", | |
| "children": [ | |
| "IPY_MODEL_02210c97d2cb481884448fa21a72c230", | |
| "IPY_MODEL_ae1417b2543d4ba7b22390464478d169", | |
| "IPY_MODEL_b9ebb699dfec405ca772cc7ad114647c" | |
| ], | |
| "layout": "IPY_MODEL_e3540f80ed9e4c01bad81112f15617d4" | |
| } | |
| }, | |
| "02210c97d2cb481884448fa21a72c230": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HTMLModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HTMLView", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_40a913ac87ad4522ac5cd13c3f3643c8", | |
| "placeholder": "", | |
| "style": "IPY_MODEL_98f36fd893a642a3803c7ebf05753fd3", | |
| "value": "tokenizer.json: 100%" | |
| } | |
| }, | |
| "ae1417b2543d4ba7b22390464478d169": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "FloatProgressModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "ProgressView", | |
| "bar_style": "success", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_8dca7f9d463c4ecdb0727f8e01724888", | |
| "max": 2132967, | |
| "min": 0, | |
| "orientation": "horizontal", | |
| "style": "IPY_MODEL_2351dc4374dd42ca950e003f6725bdb2", | |
| "value": 2132967 | |
| } | |
| }, | |
| "b9ebb699dfec405ca772cc7ad114647c": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HTMLModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HTMLView", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_7c4930fd56a547f38a4a3435a37a14b8", | |
| "placeholder": "", | |
| "style": "IPY_MODEL_c37f11f5e33141b49c9a0ab549ac048d", | |
| "value": " 2.13M/2.13M [00:00<00:00, 25.1MB/s]" | |
| } | |
| }, | |
| "e3540f80ed9e4c01bad81112f15617d4": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "40a913ac87ad4522ac5cd13c3f3643c8": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "98f36fd893a642a3803c7ebf05753fd3": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "DescriptionStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "description_width": "" | |
| } | |
| }, | |
| "8dca7f9d463c4ecdb0727f8e01724888": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "2351dc4374dd42ca950e003f6725bdb2": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "ProgressStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "bar_color": null, | |
| "description_width": "" | |
| } | |
| }, | |
| "7c4930fd56a547f38a4a3435a37a14b8": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "c37f11f5e33141b49c9a0ab549ac048d": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "DescriptionStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "description_width": "" | |
| } | |
| }, | |
| "dd4f732e14ee44fa96df3246d5da7753": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HBoxModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HBoxView", | |
| "box_style": "", | |
| "children": [ | |
| "IPY_MODEL_e7262952b2274905b99d2535ebbb0ea9", | |
| "IPY_MODEL_7c5105f8a2774b5a9bc42e5f2f392e5e", | |
| "IPY_MODEL_9f96506d464e4dc080e907b2ba2c9a40" | |
| ], | |
| "layout": "IPY_MODEL_9102425f1c624ab5bc3cdf7468a43c69" | |
| } | |
| }, | |
| "e7262952b2274905b99d2535ebbb0ea9": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HTMLModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HTMLView", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_dcab4e74b4204273bea33b1560751b36", | |
| "placeholder": "", | |
| "style": "IPY_MODEL_f441f34ad9bc46b18786ff7761d8e593", | |
| "value": "special_tokens_map.json: 100%" | |
| } | |
| }, | |
| "7c5105f8a2774b5a9bc42e5f2f392e5e": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "FloatProgressModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "ProgressView", | |
| "bar_style": "success", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_861578f5651646b887d4cb0a4f408cbd", | |
| "max": 694, | |
| "min": 0, | |
| "orientation": "horizontal", | |
| "style": "IPY_MODEL_913fade1dce94ce787c453b33870b343", | |
| "value": 694 | |
| } | |
| }, | |
| "9f96506d464e4dc080e907b2ba2c9a40": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_dom_classes": [], | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "HTMLModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_view_module_version": "1.5.0", | |
| "_view_name": "HTMLView", | |
| "description": "", | |
| "description_tooltip": null, | |
| "layout": "IPY_MODEL_7bdda64602ce4b62848b877ab243c25d", | |
| "placeholder": "", | |
| "style": "IPY_MODEL_ccc8f3103c444c98a236e2d385e5d758", | |
| "value": " 694/694 [00:00<00:00, 34.5kB/s]" | |
| } | |
| }, | |
| "9102425f1c624ab5bc3cdf7468a43c69": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "dcab4e74b4204273bea33b1560751b36": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "f441f34ad9bc46b18786ff7761d8e593": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "DescriptionStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "description_width": "" | |
| } | |
| }, | |
| "861578f5651646b887d4cb0a4f408cbd": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "913fade1dce94ce787c453b33870b343": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "ProgressStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "bar_color": null, | |
| "description_width": "" | |
| } | |
| }, | |
| "7bdda64602ce4b62848b877ab243c25d": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "model_module_version": "1.2.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.2.0", | |
| "_model_name": "LayoutModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "LayoutView", | |
| "align_content": null, | |
| "align_items": null, | |
| "align_self": null, | |
| "border": null, | |
| "bottom": null, | |
| "display": null, | |
| "flex": null, | |
| "flex_flow": null, | |
| "grid_area": null, | |
| "grid_auto_columns": null, | |
| "grid_auto_flow": null, | |
| "grid_auto_rows": null, | |
| "grid_column": null, | |
| "grid_gap": null, | |
| "grid_row": null, | |
| "grid_template_areas": null, | |
| "grid_template_columns": null, | |
| "grid_template_rows": null, | |
| "height": null, | |
| "justify_content": null, | |
| "justify_items": null, | |
| "left": null, | |
| "margin": null, | |
| "max_height": null, | |
| "max_width": null, | |
| "min_height": null, | |
| "min_width": null, | |
| "object_fit": null, | |
| "object_position": null, | |
| "order": null, | |
| "overflow": null, | |
| "overflow_x": null, | |
| "overflow_y": null, | |
| "padding": null, | |
| "right": null, | |
| "top": null, | |
| "visibility": null, | |
| "width": null | |
| } | |
| }, | |
| "ccc8f3103c444c98a236e2d385e5d758": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "model_module_version": "1.5.0", | |
| "state": { | |
| "_model_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_model_name": "DescriptionStyleModel", | |
| "_view_count": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "_view_module_version": "1.2.0", | |
| "_view_name": "StyleView", | |
| "description_width": "" | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/nan-wang/430c2b86f07675304d6c401a3cddfe6a/inspect_into_modernbert_tokenizer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "iCAXSlNU9bPo", | |
| "outputId": "a71f6cfa-607d-4dd1-f960-24225ab5a68a" | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Collecting git+https://github.com/huggingface/transformers.git\n", | |
| " Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-t53_uf7x\n", | |
| " Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-t53_uf7x\n", | |
| " Resolved https://github.com/huggingface/transformers.git to commit 59e5b3f01b7773439671c3a827348ba87dc8b92a\n", | |
| " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
| " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | |
| " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
| "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (3.16.1)\n", | |
| "Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.27.0)\n", | |
| "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (1.26.4)\n", | |
| "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (24.2)\n", | |
| "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (6.0.2)\n", | |
| "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2024.11.6)\n", | |
| "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (2.32.3)\n", | |
| "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.21.0)\n", | |
| "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (0.4.5)\n", | |
| "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.48.0.dev0) (4.67.1)\n", | |
| "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (2024.10.0)\n", | |
| "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers==4.48.0.dev0) (4.12.2)\n", | |
| "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.4.0)\n", | |
| "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (3.10)\n", | |
| "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2.2.3)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.48.0.dev0) (2024.12.14)\n", | |
| "Building wheels for collected packages: transformers\n", | |
| " Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10331157 sha256=8f6cebb74c7ece1c7260b8ad5307cbaab5a6daea3db3b51ae83b80f2623b3d16\n", | |
| " Stored in directory: /tmp/pip-ephem-wheel-cache-4o2183us/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16\n", | |
| "Successfully built transformers\n", | |
| "Installing collected packages: transformers\n", | |
| " Attempting uninstall: transformers\n", | |
| " Found existing installation: transformers 4.47.1\n", | |
| " Uninstalling transformers-4.47.1:\n", | |
| " Successfully uninstalled transformers-4.47.1\n", | |
| "Successfully installed transformers-4.48.0.dev0\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!pip install git+https://github.com/huggingface/transformers.git" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import re\n", | |
| "from transformers import AutoTokenizer\n" | |
| ], | |
| "metadata": { | |
| "id": "WboGIDSrAiGW" | |
| }, | |
| "execution_count": 2, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "patterns = [\n", | |
| " # Single-line comments (a non-whitespace character must directly follow the comment marker)\n", | |
| " re.compile(r'^\\s*(#\\S.*|//\\S.*|--\\S.*|;\\S.*)'), # Python (#), C/C++/Java/JavaScript (//), SQL (--), Lisp/assembly/INI (;)\n", | |
| " # Block comments and triple-quoted strings that open and close on the same line (the enclosed text may be empty)\n", | |
| " re.compile(r'^\\s*(/\\*.*\\*/|\\'\\'\\'[^\\'\\r\\n]*\\'\\'\\'|\\\"\\\"\\\"[^\\\"\\r\\n]*\\\"\\\"\\\")'), # C/C++, Java, JavaScript, Go, Rust, Python\n", | |
| "]\n", | |
| "\n", | |
| "def is_comment(line):\n", | |
| " # Check each compiled pattern for whether it matches the line\n", | |
| " for pattern in patterns:\n", | |
| " if pattern.match(line):\n", | |
| " return True\n", | |
| " return False\n" | |
| ], | |
| "metadata": { | |
| "id": "xotzCAcL-ttS" | |
| }, | |
| "execution_count": 4, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "voc_mbert = AutoTokenizer.from_pretrained(\"answerdotai/ModernBERT-large\").get_vocab()\n", | |
| "\n", | |
| "for k, index in voc_mbert.items():\n", | |
| " if is_comment(k):\n", | |
| " print(f\"{index}: {repr(k)}\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 1000, | |
| "referenced_widgets": [ | |
| "f8d50d3263e94aa48d49b6df2c51d7e3", | |
| "bf53f11922ae42eb995fcb00170fb27f", | |
| "c9d846ed39d4491cac3daef637e0a498", | |
| "ace914fcaf864e16bef11a5f97ce626c", | |
| "6b771547d7524857bbc0854bc62fa85f", | |
| "663d82b5194149848cd22895d024ea7c", | |
| "361f137127b34ea3986e42ba05926cb6", | |
| "760b75b6da344c1d83ea4362668a22bb", | |
| "e8d97ad24bb74bdf97bd2a21e17a2bb0", | |
| "37085e6939734a5fa888b31f602b9b97", | |
| "40453e191280454eb70bce4b8250264c", | |
| "5ea7b7bbb88c468e9ba06bd82c48c097", | |
| "02210c97d2cb481884448fa21a72c230", | |
| "ae1417b2543d4ba7b22390464478d169", | |
| "b9ebb699dfec405ca772cc7ad114647c", | |
| "e3540f80ed9e4c01bad81112f15617d4", | |
| "40a913ac87ad4522ac5cd13c3f3643c8", | |
| "98f36fd893a642a3803c7ebf05753fd3", | |
| "8dca7f9d463c4ecdb0727f8e01724888", | |
| "2351dc4374dd42ca950e003f6725bdb2", | |
| "7c4930fd56a547f38a4a3435a37a14b8", | |
| "c37f11f5e33141b49c9a0ab549ac048d", | |
| "dd4f732e14ee44fa96df3246d5da7753", | |
| "e7262952b2274905b99d2535ebbb0ea9", | |
| "7c5105f8a2774b5a9bc42e5f2f392e5e", | |
| "9f96506d464e4dc080e907b2ba2c9a40", | |
| "9102425f1c624ab5bc3cdf7468a43c69", | |
| "dcab4e74b4204273bea33b1560751b36", | |
| "f441f34ad9bc46b18786ff7761d8e593", | |
| "861578f5651646b887d4cb0a4f408cbd", | |
| "913fade1dce94ce787c453b33870b343", | |
| "7bdda64602ce4b62848b877ab243c25d", | |
| "ccc8f3103c444c98a236e2d385e5d758" | |
| ] | |
| }, | |
| "id": "1D2vvr1aAMRa", | |
| "outputId": "efa6356f-599b-4019-efd2-1e672dbca904" | |
| }, | |
| "execution_count": 7, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "tokenizer_config.json: 0%| | 0.00/20.8k [00:00<?, ?B/s]" | |
| ], | |
| "application/vnd.jupyter.widget-view+json": { | |
| "version_major": 2, | |
| "version_minor": 0, | |
| "model_id": "f8d50d3263e94aa48d49b6df2c51d7e3" | |
| } | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "tokenizer.json: 0%| | 0.00/2.13M [00:00<?, ?B/s]" | |
| ], | |
| "application/vnd.jupyter.widget-view+json": { | |
| "version_major": 2, | |
| "version_minor": 0, | |
| "model_id": "5ea7b7bbb88c468e9ba06bd82c48c097" | |
| } | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "special_tokens_map.json: 0%| | 0.00/694 [00:00<?, ?B/s]" | |
| ], | |
| "application/vnd.jupyter.widget-view+json": { | |
| "version_major": 2, | |
| "version_minor": 0, | |
| "model_id": "dd4f732e14ee44fa96df3246d5da7753" | |
| } | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "30863: ';\"><'\n", | |
| "39485: '-------------------------------------------'\n", | |
| "44136: '-------------------------------------------------'\n", | |
| "12723: ';;'\n", | |
| "29295: '----------------------------'\n", | |
| "5146: '######'\n", | |
| "19628: '--------------------------------------------------------------------------------'\n", | |
| "45263: '--------------------------------------------------'\n", | |
| "15623: '---|'\n", | |
| "48924: '//----------------------------------------------------------------'\n", | |
| "397: '--------'\n", | |
| "2518: '----------------------------------------------------------------'\n", | |
| "22902: '################################'\n", | |
| "28693: '------------------------------'\n", | |
| "9794: '---------'\n", | |
| "34194: ';&#'\n", | |
| "20744: ';\\\\;\\\\'\n", | |
| "22873: '---------------------'\n", | |
| "45599: ';|'\n", | |
| "26577: '#.'\n", | |
| "10326: '#:'\n", | |
| "1835: '####'\n", | |
| "22158: '-->'\n", | |
| "29234: ';_'\n", | |
| "29648: '-----------------------------'\n", | |
| "26077: ';<'\n", | |
| "1013: '--------------------------------'\n", | |
| "32657: ';;;;'\n", | |
| "28505: '------------------------------------------------------------------------------------------------'\n", | |
| "39423: '----------------------------------------------------------------------------------------------------------------'\n", | |
| "22928: '------------------'\n", | |
| "24702: '-------------------------'\n", | |
| "42277: '--;'\n", | |
| "11311: '---------------'\n", | |
| "16352: ';\"'\n", | |
| "42451: '---------------------------------------------'\n", | |
| "35349: '--------------------------------------'\n", | |
| "10521: '--------------'\n", | |
| "9962: '----------'\n", | |
| "43657: ';{\\\\'\n", | |
| "2917: '////'\n", | |
| "37041: '------------------------------------------------------------------------'\n", | |
| "48151: '-------------------------------------------------------'\n", | |
| "39481: '//!'\n", | |
| "22866: ';}'\n", | |
| "817: '##'\n", | |
| "47332: '----------------------------------------------------'\n", | |
| "44391: '#{$'\n", | |
| "16525: '----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'\n", | |
| "4118: '###'\n", | |
| "47726: \"#'\"\n", | |
| "30624: ';/'\n", | |
| "315: '----'\n", | |
| "40904: '--------------------------------------------'\n", | |
| "36311: '---------------------------------------'\n", | |
| "2912: '------------'\n", | |
| "26043: '------------------------'\n", | |
| "7405: ';\\\\'\n", | |
| "35000: '-------------------------------------'\n", | |
| "26836: '--\"'\n", | |
| "23380: '-----------------'\n", | |
| "23796: '-------------------'\n", | |
| "47584: '---------------------------------------------------'\n", | |
| "30282: '---------------------------------'\n", | |
| "6675: '########'\n", | |
| "9998: '-----------'\n", | |
| "9032: '////////////////'\n", | |
| "43067: '----------------------------------------------------------------------'\n", | |
| "50001: '--['\n", | |
| "33585: '-----------------------------------'\n", | |
| "38944: '-----------------------------------------'\n", | |
| "13011: ';\">'\n", | |
| "1532: '---'\n", | |
| "6154: '------------------------------------------------'\n", | |
| "34638: '------------------------------------'\n", | |
| "6846: '-------'\n", | |
| "10428: '-------------'\n", | |
| "7040: '-----'\n", | |
| "573: '----------------'\n", | |
| "13309: ';&'\n", | |
| "22002: '#####'\n", | |
| "16985: '////////////////////////////////'\n", | |
| "5071: '////////'\n", | |
| "7078: '--------------------------------------------------------------------------------------------------------------------------------'\n", | |
| "25916: '-----------------------'\n", | |
| "48904: '------------------------------------------------------'\n", | |
| "36739: ';,'\n", | |
| "28388: '---------------------------'\n", | |
| "47632: '---|---|---'\n", | |
| "39421: '------------------------------------------'\n", | |
| "4485: '------'\n", | |
| "36960: '----------------------------------------'\n", | |
| "43500: '-----------------------------------------------'\n", | |
| "13143: ';</'\n", | |
| "42040: '----------------------------------------------'\n", | |
| "28511: ';\\\\;'\n", | |
| "27396: '#,'\n", | |
| "15879: '--------------------'\n", | |
| "33250: '----------------------------------'\n", | |
| "27800: '--------------------------'\n", | |
| "33301: '--**'\n", | |
| "32107: '-------------------------------'\n", | |
| "23130: '----------------------'\n", | |
| "11890: '################'\n", | |
| "37446: '////////////////////////////////////////////////////////////////'\n", | |
| "10638: '///'\n", | |
| "20782: '---|---'\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "voc_roberta = AutoTokenizer.from_pretrained('roberta-large').get_vocab()\n", | |
| "\n", | |
| "for k, index in voc_roberta.items():\n", | |
| " if is_comment(k):\n", | |
| " print(f\"{index}: {repr(k)}\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "u-EMCfnaAQtx", | |
| "outputId": "2f200689-9827-4170-9d41-a7abf6b1d170" | |
| }, | |
| "execution_count": 6, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "49216: '////////////////////////////////'\n", | |
| "31175: '---------------'\n", | |
| "46580: '------------------------'\n", | |
| "49308: '----------------------------------------------------------------'\n", | |
| "49104: '///'\n", | |
| "49198: '--------'\n", | |
| "48803: ';}'\n", | |
| "45072: '----------'\n", | |
| "49625: '-->'\n", | |
| "50012: ';;;;'\n", | |
| "49283: '////////'\n", | |
| "49296: '////////////////'\n", | |
| "46156: '------------'\n", | |
| "44516: '------'\n", | |
| "49599: '--------------------------------------------------------'\n", | |
| "49909: ';;;;;;;;'\n", | |
| "47655: '--------------------'\n", | |
| "49629: '####'\n", | |
| "49183: '-----------'\n", | |
| "48342: '##'\n", | |
| "48712: '-------'\n", | |
| "44259: '----'\n", | |
| "39550: '-------------'\n", | |
| "47826: '---------'\n", | |
| "46939: '--------------------------------'\n", | |
| "41110: '--------------'\n", | |
| "48134: '###'\n", | |
| "50072: ';;;;;;;;;;;;'\n", | |
| "49255: '#$'\n", | |
| "49674: '################################'\n", | |
| "49727: '########'\n", | |
| "46343: '-----'\n", | |
| "48900: '////'\n", | |
| "42777: ';\"'\n", | |
| "49806: '################'\n", | |
| "48640: ';;'\n", | |
| "50065: '#$#$'\n", | |
| "24965: '---'\n", | |
| "24524: '----------------'\n", | |
| "49374: '------------------------------------------------'\n", | |
| "49903: '--+'\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment