Last active
November 14, 2017 16:03
-
-
Save warenlg/93a42f56582ace1635c255b680dd5de3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"source": [ | |
"%pylab inline\n", | |
"import importlib\n", | |
"import os\n", | |
"from collections import Counter, deque\n", | |
"from operator import eq\n", | |
"\n", | |
"import matplotlib.pyplot as pl\n", | |
"import numpy as np\n", | |
"import pyarrow.parquet as pq\n", | |
"from bblfsh.sdkversion import VERSION" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# UASTs extraction with the source{d} engine" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Uasts collected from the 100-siva-files dataset located in /data/siva/100-java\n", | |
"PATH_TO_UASTS = \"/home/waren/sourced/science3_local/code_duplication/100_uasts/\"\n", | |
"Node = importlib.import_module(\"bblfsh.gopkg.in.bblfsh.sdk.%s.uast.generated_pb2\" % VERSION).Node" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"nb of uasts : 200\n" | |
] | |
} | |
], | |
"source": [ | |
"# Walk the dump in sorted order so that uasts[i] refers to the same file on every run.\n", | |
"uasts = []\n", | |
"for root, dirs, files in os.walk(PATH_TO_UASTS):\n", | |
"    dirs.sort()  # sorting in place steers the order in which os.walk descends\n", | |
"    for f in sorted(files):\n", | |
"        path_to_uast = os.path.join(root, f)\n", | |
"        # Column 6, first row, first element: presumably the serialized UAST - TODO confirm against the dump schema.\n", | |
"        serialized_uast = pq.read_table(path_to_uast)[6].data.to_pylist()[0][0]\n", | |
"        uasts.append(Node.FromString(serialized_uast))\n", | |
"\n", | |
"print(\"nb of uasts :\", len(uasts))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_features(uasts):\n", | |
"    \"\"\"Return one (roles, nb_children) pair for every node of every UAST.\n", | |
"\n", | |
"    Each tree is walked breadth-first from its root. `roles` is the node's roles\n", | |
"    as a sorted tuple and `nb_children` is the number of its direct children.\n", | |
"    \"\"\"\n", | |
"    features = []\n", | |
"    for uast in uasts:\n", | |
"        queue = deque([uast])\n", | |
"        while queue:\n", | |
"            node = queue.popleft()  # O(1), whereas list.pop(0) shifts the whole list\n", | |
"            queue.extend(node.children)\n", | |
"            # builtin sorted() rather than the `sort` that only exists after %pylab\n", | |
"            features.append((tuple(sorted(node.roles)), len(node.children)))\n", | |
"    return features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"total number of nodes : 387967\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[((34, 57), 9),\n", | |
" ((18,), 1),\n", | |
" ((19, 41, 42), 1),\n", | |
" ((19, 41, 42), 1),\n", | |
" ((19, 41, 42), 1),\n", | |
" ((1, 41, 45, 47), 2),\n", | |
" ((1, 41, 45, 47), 2),\n", | |
" ((1, 41, 45, 47), 2),\n", | |
" ((1, 41, 45, 47), 2),\n", | |
" ((19, 60), 2)]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"features = get_features(uasts)\n", | |
"print(\"total number of nodes :\", len(features))\n", | |
"features[:10]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Quantization of the number of children" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Unzip the (roles, nb_children) pairs into two parallel lists.\n", | |
"list_roles = [feature[0] for feature in features]\n", | |
"list_nb_children = [feature[1] for feature in features]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAD8CAYAAACLrvgBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFt5JREFUeJzt3XGsXvV93/H3Z3ZJk3QJJtwxapvZbZxKDtIW4gZP6ao0\nbsCEqmYSSY224WZWvC3QpVOlYLI/qJIgmS4rLVrC5MYeJspwEE2LVZy5HqHLJg3iS0gBQxh3DgnX\nAuxgB5pFCXPy3R/Pz9mTy72+J/e59nN97/slXT3nfM/vnPM7OsIfzjm/5zmpKiRJ6uJvDbsDkqSz\nh6EhSerM0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJnhoYkqTNDQ5LU2eJhd2C2nX/++bVixYphd0OS\nzioPP/zwt6tqZLp28y40VqxYwejo6LC7IUlnlSTf7NLO21OSpM4MDUlSZ4aGJKkzQ0OS1JmhIUnq\nzNCQJHVmaEiSOjM0JEmdGRqSpM7m3TfCB7Fi632d2j2z7crT3BNJmpu80pAkdWZoSJI6MzQkSZ0Z\nGpKkzqYNjSQ7kxxJ8viE+u8k+XqSg0n+oK9+Y5KxJE8lubyvvr7VxpJs7auvTPJQq38+yTmt/po2\nP9aWr5iNA5YkzVyXK407gPX9hSS/BmwA/n5VvRX4ZKuvBjYCb23rfDrJoiSLgE8BVwCrgWtaW4Bb\ngFur6s3AcWBzq28Gjrf6ra2dJGmIpg2NqvoycGxC+V8B26rqB63NkVbfAOyuqh9U1TeAMeAd7W+s\nqg5V1SvAbmBDkgDvBu5p6+8Crurb1q42fQ+wrrWXJA3JTJ9pvAX4R+220X9L8sutvhR4tq/deKtN\nVX8T8J2qOjGh/hPbastfau0lSUMy0y/3LQbOA9YCvwzcneQXZq1XP6UkW4AtABdddNGwuiFJ895M\nrzTGgS9Uz1eAHwHnA4eB5X3tlrXaVPUXgXOTLJ5Qp3+dtvyNrf2rVNX2qlpTVWtGRqZ9L7okaYZm\nGhp/DvwaQJK3AOcA3wb2ABvbyKeVwCrgK8ABYFUbKXUOvYfle6qqgAeAq9t2NwH3tuk9bZ62/Eut\nvSRpSKa9PZXkLuBdwPlJxoGbgJ3AzjYM9xVgU/sH/WCSu4EngBPAdVX1w7ad64F9wCJgZ1UdbLu4\nAdid5BPAI8COVt8BfDbJGL0H8Rtn4XglSQOYNjSq6popFv3TKdrfDNw8SX0vsHeS+iF6o6sm1r8P\nvG+6/kmSzhy/ES5J6szQkCR1ZmhIkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktSZoSFJ6szQkCR1\nZmhIkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktTZtKGRZGeSI+0tfROX/V6SSnJ+m0+S25KMJXk0\nySV9bTclebr9beqrvz3JY22d25Kk1c9Lsr+1359kyewcsiRpprpcadwBrJ9YTLIcuAz4Vl/5Cnrv\nBV8FbAFub23Po/ea2EvpvaXvpr4QuB34YN96J/e1Fbi/qlYB97d5SdIQTRsaVfVleu/onuhW4CNA\n9dU2AHdWz4PAuUkuBC4H9lfVsao6DuwH1rdlb6iqB9s7xu8Erurb1q42vauvLkkakhk900iyAThc\nVX89YdFS4Nm++fFWO1V9fJI6wAVV9Vybfh64YCZ9lSTNnsU/7QpJXgd8lN6tqTOiqipJTbU8yRZ6\nt8O46KKLzlS3JGnBmcmVxi8CK4G/TvIMsAz4apK/CxwGlve1XdZqp6ovm6QO8EK7fUX7PDJVh6pq\ne1Wtqao1IyMjMzgkSVIXP3VoVNVjVfV3qmpFVa2gd0vpkqp6HtgDXNtGUa0FXmq3mPYBlyVZ0h6A\nXwbsa8teTrK2jZq6Fri37WoPcHKU1aa+uiRpSLoMub0L+J/ALyUZT7L5FM33AoeAMeBPgA8BVNUx\n4OPAgfb3sVajtflMW+d/A19s9W3Ae5I8Dfx6
m5ckDdG0zzSq6ppplq/omy7guina7QR2TlIfBS6e\npP4isG66/kmSzhy/ES5J6szQkCR1ZmhIkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktSZoSFJ6szQ\nkCR1ZmhIkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktRZlzf37UxyJMnjfbV/l+TrSR5N8mdJzu1b\ndmOSsSRPJbm8r76+1caSbO2rr0zyUKt/Psk5rf6aNj/Wlq+YrYOWJM3MtG/uA+4A/gNwZ19tP3Bj\nVZ1IcgtwI3BDktXARuCtwM8D/zXJW9o6nwLeQ++d4geS7KmqJ4BbgFuraneS/whsBm5vn8er6s1J\nNrZ2vzXY4c6OFVvv69TumW1XnuaeSNKZNe2VRlV9GTg2ofaXVXWizT4ILGvTG4DdVfWDqvoGvfd+\nv6P9jVXVoap6BdgNbEgS4N3APW39XcBVfdva1abvAda19pKkIZmNZxr/HPhim14KPNu3bLzVpqq/\nCfhOXwCdrP/Ettryl1r7V0myJcloktGjR48OfECSpMkNFBpJ/i1wAvjc7HRnZqpqe1Wtqao1IyMj\nw+yKJM1rXZ5pTCrJbwO/Aayrqmrlw8DyvmbLWo0p6i8C5yZZ3K4m+tuf3NZ4ksXAG1t7SdKQzOhK\nI8l64CPAb1bV9/oW7QE2tpFPK4FVwFeAA8CqNlLqHHoPy/e0sHkAuLqtvwm4t29bm9r01cCX+sJJ\nkjQE015pJLkLeBdwfpJx4CZ6o6VeA+xvz6YfrKp/WVUHk9wNPEHvttV1VfXDtp3rgX3AImBnVR1s\nu7gB2J3kE8AjwI5W3wF8NskYvQfxG2fheCVJA5g2NKrqmknKOyapnWx/M3DzJPW9wN5J6ofoja6a\nWP8+8L7p+idJOnP8RrgkqTNDQ5LUmaEhSerM0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJnhoYkqTND\nQ5LUmaEhSerM0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJn04ZGkp1JjiR5vK92XpL9SZ5un0taPUlu\nSzKW5NEkl/Sts6m1fzrJpr7625M81ta5Le1VgFPtQ5I0PF2uNO4A1k+obQXur6pVwP1tHuAKeu8F\nXwVsAW6HXgDQe03spfTe0ndTXwjcDnywb7310+xDkjQk04ZGVX2Z3ju6+20AdrXpXcBVffU7q+dB\n4NwkFwKXA/ur6lhVHQf2A+vbsjdU1YNVVcCdE7Y12T4kSUMy02caF1TVc236eeCCNr0UeLav3Xir\nnao+Pkn9VPt4lSRbkowmGT169OgMDkeS1MXAD8LbFULNQl9mvI+q2l5Va6pqzcjIyOnsiiQtaDMN\njRfarSXa55FWPwws72u3rNVOVV82Sf1U+5AkDclMQ2MPcHIE1Cbg3r76tW0U1VrgpXaLaR9wWZIl\n7QH4ZcC+tuzlJGvbqKlrJ2xrsn1IkoZk8XQNktwFvAs4P8k4vVFQ24C7k2wGvgm8vzXfC7wXGAO+\nB3wAoKqOJfk4cKC1+1hVnXy4/iF6I7ReC3yx/XGKfUiShmTa0Kiqa6ZYtG6StgVcN8V2dgI7J6mP\nAhdPUn9xsn1IkobHb4RLkjozNCRJnRkakqTODA1JUmeGhiSpM0NDktSZoSFJ6szQkCR1ZmhIkjoz\nNCRJnRkakqTODA1JUmeGhiSpM0NDktSZoSFJ6myg0Ejyb5IcTPJ4kruS/GySlUkeSjKW5PNJzmlt\nX9Pmx9ryFX3bubHVn0pyeV99fauNJdk6SF8lSYObcWgkWQr8a2BNVV0MLAI2ArcAt1bVm4HjwOa2\nymbgeKvf2tqRZHVb763AeuDTSRYlWQR8CrgCWA1c09pKkoZk0NtTi4HXJlkMvA54Dng3cE9bvgu4\nqk1vaPO05evae8E3ALur6gdV9Q16r4p9R/sbq6pDVfUKsLu1lSQNyYxDo6oOA58EvkUvLF4CHga+\nU1UnWrNx
YGmbXgo829Y90dq/qb8+YZ2p6pKkIRnk9tQSev/nvxL4eeD19G4vnXFJtiQZTTJ69OjR\nYXRBkhaEQW5P/Trwjao6WlX/F/gC8E7g3Ha7CmAZcLhNHwaWA7TlbwRe7K9PWGeq+qtU1faqWlNV\na0ZGRgY4JEnSqQwSGt8C1iZ5XXs2sQ54AngAuLq12QTc26b3tHna8i9VVbX6xja6aiWwCvgKcABY\n1UZjnUPvYfmeAforSRrQ4umbTK6qHkpyD/BV4ATwCLAduA/YneQTrbajrbID+GySMeAYvRCgqg4m\nuZte4JwArquqHwIkuR7YR29k1s6qOjjT/kqSBjfj0ACoqpuAmyaUD9Eb+TSx7feB902xnZuBmyep\n7wX2DtJHSdLs8RvhkqTODA1JUmeGhiSpM0NDktSZoSFJ6szQkCR1ZmhIkjozNCRJnRkakqTODA1J\nUmeGhiSpM0NDktSZoSFJ6szQkCR1ZmhIkjozNCRJnQ0UGknOTXJPkq8neTLJP0xyXpL9SZ5un0ta\n2yS5LclYkkeTXNK3nU2t/dNJNvXV357ksbbObe21spKkIRnozX3AHwP/paqubu/xfh3wUeD+qtqW\nZCuwFbgBuILe+79XAZcCtwOXJjmP3tv/1gAFPJxkT1Udb20+CDxE7w1+64EvDtjnM2bF1vs6t31m\n25WnsSeSNDtmfKWR5I3Ar9LeAV5Vr1TVd4ANwK7WbBdwVZveANxZPQ8C5ya5ELgc2F9Vx1pQ7AfW\nt2VvqKoHq6qAO/u2JUkagkFuT60EjgL/KckjST6T5PXABVX1XGvzPHBBm14KPNu3/nirnao+Pkld\nkjQkg4TGYuAS4Paqehvwf+jdivqxdoVQA+yjkyRbkowmGT169Ojp3p0kLViDhMY4MF5VD7X5e+iF\nyAvt1hLt80hbfhhY3rf+slY7VX3ZJPVXqartVbWmqtaMjIwMcEiSpFOZcWhU1fPAs0l+qZXWAU8A\ne4CTI6A2Afe26T3AtW0U1VrgpXYbax9wWZIlbaTVZcC+tuzlJGvbqKlr+7YlSRqCQUdP/Q7wuTZy\n6hDwAXpBdHeSzcA3gfe3tnuB9wJjwPdaW6rqWJKPAwdau49V1bE2/SHgDuC19EZNnTUjpyRpPhoo\nNKrqa/SGyk60bpK2BVw3xXZ2AjsnqY8CFw/SR0nS7PEb4ZKkzgwNSVJnhoYkqTNDQ5LUmaEhSerM\n0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJnhoYkqTNDQ5LUmaEhSerM0JAkdWZoSJI6MzQkSZ0NHBpJ\nFiV5JMlftPmVSR5KMpbk8+2tfiR5TZsfa8tX9G3jxlZ/KsnlffX1rTaWZOugfZUkDWY2rjQ+DDzZ\nN38LcGtVvRk4Dmxu9c3A8Va/tbUjyWpgI/BWYD3w6RZEi4BPAVcAq4FrWltJ0pAMFBpJlgFXAp9p\n8wHeDdzTmuwCrmrTG9o8bfm61n4DsLuqflBV36D3DvF3tL+xqjpUVa8Au1tbSdKQDHql8UfAR4Af\ntfk3Ad+pqhNtfhxY2qaXAs8CtOUvtfY/rk9YZ6r6qyTZkmQ0yejRo0cHPCRJ0lRmHBpJfgM4UlUP\nz2J/ZqSqtlfVmqpaMzIyMuzuSNK8tXiAdd8J/GaS9wI/C7wB+GPg3CSL29XEMuBwa38YWA6MJ1kM\nvBF4sa9+Uv86U9UlSUMw4yuNqrqxqpZV1Qp6D7K/VFX/BHgAuLo12wTc26b3tHna8i9VVbX6xja6\naiWwCvgKcABY1UZjndP2sWem/ZUkDW6QK42p3ADsTvIJ4BFgR6vvAD6bZAw4Ri8EqKqDSe4GngBO\nANdV1Q8BklwP7AMWATur6uBp6K8kqaNZCY2q+ivgr9r0IXojnya2+T7wvinWvxm4eZL6XmDvbPRR\nkjQ4vxEuSerM0JAkdWZoSJI6MzQkSZ0ZGpKkzgwNSVJnhoYkqTNDQ5LUma
EhSerM0JAkdXY6fntK\nM7Bi632d2j2z7crT3BNJmppXGpKkzgwNSVJnhoYkqTNDQ5LU2SDvCF+e5IEkTyQ5mOTDrX5ekv1J\nnm6fS1o9SW5LMpbk0SSX9G1rU2v/dJJNffW3J3msrXNbkgxysJKkwQxypXEC+L2qWg2sBa5LshrY\nCtxfVauA+9s8wBX0XuW6CtgC3A69kAFuAi6l9/Kmm04GTWvzwb711g/QX0nSgAZ5R/hzVfXVNv03\nwJPAUmADsKs12wVc1aY3AHdWz4PAuUkuBC4H9lfVsao6DuwH1rdlb6iqB9u7xO/s25YkaQhm5ZlG\nkhXA24CHgAuq6rm26Hnggja9FHi2b7XxVjtVfXySuiRpSAYOjSQ/B/wp8LtV9XL/snaFUIPuo0Mf\ntiQZTTJ69OjR0707SVqwBgqNJD9DLzA+V1VfaOUX2q0l2ueRVj8MLO9bfVmrnaq+bJL6q1TV9qpa\nU1VrRkZGBjkkSdIpDDJ6KsAO4Mmq+sO+RXuAkyOgNgH39tWvbaOo1gIvtdtY+4DLkixpD8AvA/a1\nZS8nWdv2dW3ftiRJQzDIb0+9E/hnwGNJvtZqHwW2AXcn2Qx8E3h/W7YXeC8wBnwP+ABAVR1L8nHg\nQGv3sao61qY/BNwBvBb4YvuTJA3JjEOjqv4HMNX3JtZN0r6A66bY1k5g5yT1UeDimfZRkjS7/Ea4\nJKkzQ0OS1JmhIUnqzNCQJHVmaEiSOjM0JEmdGRqSpM4MDUlSZ4aGJKkzQ0OS1JmhIUnqzNCQJHVm\naEiSOjM0JEmdGRqSpM4GeQmThmDF1vs6tXtm25WnuSeSFqI5f6WRZH2Sp5KMJdk67P5I0kI2p0Mj\nySLgU8AVwGrgmiSrh9srSVq45vrtqXcAY1V1CCDJbmAD8MRQe3UW8DaWpNNhrofGUuDZvvlx4NIh\n9WVemu1wMazmJs+LZstcD41OkmwBtrTZ7yZ5aoabOh/49uz0as77qY41t8zuzmd7e6fgOf0pnMHz\nMgjP6enx97o0muuhcRhY3je/rNV+QlVtB7YPurMko1W1ZtDtnA0WyrEulOOEhXOsC+U4YW4e65x+\nEA4cAFYlWZnkHGAjsGfIfZKkBWtOX2lU1Ykk1wP7gEXAzqo6OORuSdKCNadDA6Cq9gJ7z9DuBr7F\ndRZZKMe6UI4TFs6xLpTjhDl4rKmqYfdBknSWmOvPNCRJc4ih0SyUnytJ8kySx5J8LcnosPszm5Ls\nTHIkyeN9tfOS7E/ydPtcMsw+zpYpjvX3kxxu5/ZrSd47zD7OhiTLkzyQ5IkkB5N8uNXn1Xk9xXHO\nuXPq7Sl+/HMl/wt4D70vEB4ArqmqeffN8yTPAGuqat6Nc0/yq8B3gTur6uJW+wPgWFVta/8zsKSq\nbhhmP2fDFMf6+8B3q+qTw+zbbEpyIXBhVX01yd8GHgauAn6beXReT3Gc72eOnVOvNHp+/HMlVfUK\ncPLnSnQWqaovA8cmlDcAu9r0Lnr/IZ71pjjWeaeqnquqr7bpvwGepPdLEfPqvJ7iOOccQ6Nnsp8r\nmZMnbBYU8JdJHm7fpJ/vLqiq59r088AFw+zMGXB9kkfb7auz+pbNRElWAG8DHmIen9cJxwlz7Jwa\nGgvPr1TVJfR+Ofi6dptjQajevdj5fD/2duAXgX8APAf8++F2Z/Yk+TngT4HfraqX+5fNp/M6yXHO\nuXNqaPR0+rmS+aCqDrfPI8Cf0bs1N5+90O4Xn7xvfGTI/TltquqFqvphVf0I+BPmyblN8jP0/iH9\nXFV9oZXn3Xmd7Djn4jk1NHoWxM+VJHl9e8hGktcDlwGPn3qts94eYFOb3gTcO8S+nFYn/xFt/jHz\n4NwmCbADeLKq/rBv0bw6r1Md51w8p46eatpQtj/i//9cyc1D7tKsS/IL9K4uoPdrAP95Ph1nkruA\nd9H7ZdAXgJuAPwfuBi4Cvgm8v6rO+g
fIUxzru+jdxijgGeBf9N33Pysl+RXgvwOPAT9q5Y/Su98/\nb87rKY7zGubYOTU0JEmdeXtKktSZoSFJ6szQkCR1ZmhIkjozNCRJnRkakqTODA1JUmeGhiSps/8H\njtx6qw9lqFcAAAAASUVORK5CYII=\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0x7ff98516fbe0>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Histogram of the child counts, restricted to nodes with fewer than 30 children.\n", | |
"values = np.array(list_nb_children)\n", | |
"_, bins, _ = pl.hist(values[values < 30], bins=30)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"proportion of leaves : 0.43184858506006957\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"proportion of leaves :\", len([x for x in list_nb_children if x==0]) / len(list_nb_children))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"distinct number of children : 30\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"distinct number of children :\", len(set(list_nb_children)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"partition : [0, 1, 2, 3, 4, 6, 26, 78]\n" | |
] | |
} | |
], | |
"source": [ | |
"# quantization of the number of children\n", | |
"\n", | |
"def get_quantization(list_nb_children, fineness_of_partition=2):\n", | |
"    \"\"\"Derive the bounds used to quantize the number of children.\n", | |
"\n", | |
"    Distinct child counts are consumed in increasing order and grouped until a group\n", | |
"    holds at least len(list_nb_children) / (100 * fineness_of_partition) nodes. The\n", | |
"    largest count of each group becomes a bound, and the overall maximum is always\n", | |
"    the final bound.\n", | |
"\n", | |
"    fineness_of_partition must be a positive integer. The input list is not modified.\n", | |
"    Returns the list of increasing bounds, or [] for an empty input.\n", | |
"    \"\"\"\n", | |
"    if not list_nb_children:\n", | |
"        return []\n", | |
"\n", | |
"    nodes_per_value = Counter(list_nb_children)  # one pass instead of a rescan per value\n", | |
"    unique_nb_children = sorted(nodes_per_value)  # set iteration order is not guaranteed ascending\n", | |
"    max_nb_children = unique_nb_children[-1]\n", | |
"    nodes_in_partition = len(list_nb_children) / (100 * fineness_of_partition)\n", | |
"\n", | |
"    partition = []\n", | |
"    id_nb_children = 0\n", | |
"    while True:\n", | |
"        nb_nodes_cumulate = 0\n", | |
"        while nb_nodes_cumulate < nodes_in_partition and unique_nb_children[id_nb_children] < max_nb_children:\n", | |
"            nb_nodes_cumulate += nodes_per_value[unique_nb_children[id_nb_children]]\n", | |
"            id_nb_children += 1\n", | |
"        if unique_nb_children[id_nb_children] == max_nb_children:\n", | |
"            # The maximum closes the partition, whatever the size of the last group.\n", | |
"            partition.append(max_nb_children)\n", | |
"            return partition\n", | |
"        partition.append(unique_nb_children[id_nb_children - 1])\n", | |
"\n", | |
"print(\"partition :\", get_quantization(list_nb_children))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"Distribution of nodes based on the previous number of children partition :" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[0, 1[ : 167543 nodes\n", | |
"[1, 2[ : 122345 nodes\n", | |
"[2, 3[ : 62102 nodes\n", | |
"[3, 4[ : 27518 nodes\n", | |
"[4, 6[ : 5387 nodes\n", | |
"[6, 26[ : 2874 nodes\n", | |
"[26, 78[ : 170 nodes\n" | |
] | |
} | |
], | |
"source": [ | |
"partition = get_quantization(list_nb_children)\n", | |
"nb_nodes_per_value = Counter(list_nb_children)\n", | |
"# Report how many nodes fall in each half-open interval [lower, upper[ of the partition.\n", | |
"for lower, upper in zip(partition, partition[1:]):\n", | |
"    nb_nodes = sum(nb_nodes_per_value[i] for i in range(lower, upper))\n", | |
"    print(\"[{}, {}[ : {} nodes\".format(lower, upper, nb_nodes))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"linear_values = np.arange(len(partition))\n", | |
"\n", | |
"def stairvalue(value):\n", | |
"    \"\"\"Map a number of children to the index of its interval in `partition`.\n", | |
"\n", | |
"    Interval i is [partition[i], partition[i+1][ and the last interval also contains\n", | |
"    max(partition). Values outside the partition are clamped to the first or last\n", | |
"    interval instead of wrapping around through a negative index.\n", | |
"    \"\"\"\n", | |
"    last_interval = max(len(partition) - 2, 0)\n", | |
"    idx = int(np.searchsorted(partition, value, side=\"right\")) - 1\n", | |
"    return linear_values[min(max(idx, 0), last_interval)]\n", | |
"\n", | |
"stairvalue(5)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Merge rare combinations of roles to their nearest neighbor based on the Jaccard similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[((1, 18, 84, 85), 28441),\n", | |
" ((1, 18), 24241),\n", | |
" ((106,), 21252),\n", | |
" ((18,), 19002),\n", | |
" ((1, 6, 18), 18840),\n", | |
" ((1, 2, 18), 18688),\n", | |
" ((4, 18, 104), 16518),\n", | |
" ((1, 2, 18, 48, 84), 16461),\n", | |
" ((1, 18, 45, 47, 49, 84, 86), 15933),\n", | |
" ((18, 45, 84), 13943)]" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Number of nodes carrying each distinct combination of roles.\n", | |
"roles_counts = Counter(list_roles)\n", | |
"roles_counts.most_common(10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_roles2merge(roles_counts, threshold=100):\n", | |
"    \"\"\"List the role combinations that occur at most `threshold` times.\"\"\"\n", | |
"    return [roles for roles in set(roles_counts) if roles_counts[roles] <= threshold]\n", | |
"\n", | |
"roles2merge = get_roles2merge(roles_counts)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_intersects(node_roles, bag_of_roles):\n", | |
"    \"\"\"Count, for every candidate in `bag_of_roles`, the roles it shares with `node_roles`.\n", | |
"\n", | |
"    Candidates listed in the module-level `roles2merge`, and `node_roles` itself, are\n", | |
"    skipped. Returns {candidate: number of its roles found in node_roles}; candidates\n", | |
"    sharing no role are left out.\n", | |
"    \"\"\"\n", | |
"    # Built once up front so that each candidate costs a single hash lookup.\n", | |
"    excluded = set(roles2merge)\n", | |
"    excluded.add(tuple(node_roles))\n", | |
"\n", | |
"    node_roles_intersect = {}\n", | |
"    for roles in set(bag_of_roles):\n", | |
"        if roles in excluded:\n", | |
"            continue\n", | |
"        nb_shared = sum(1 for role in roles if role in node_roles)\n", | |
"        if nb_shared:\n", | |
"            node_roles_intersect[roles] = nb_shared\n", | |
"    return node_roles_intersect\n", | |
"\n", | |
"def get_jaccard_similarities(node_roles, nearest_neighbor, val_inter):\n", | |
"    \"\"\"Jaccard similarity between `node_roles` and each candidate of `nearest_neighbor`.\n", | |
"\n", | |
"    `val_inter` is used as the intersection size for every candidate. The union size\n", | |
"    is len(node_roles) plus the number of candidate roles absent from node_roles.\n", | |
"    Returns {candidate: val_inter / union size}.\n", | |
"    \"\"\"\n", | |
"    def union_size(candidate):\n", | |
"        extra_roles = [role for role in candidate if role not in node_roles]\n", | |
"        return len(node_roles) + len(extra_roles)\n", | |
"\n", | |
"    return {candidate: val_inter / union_size(candidate) for candidate in nearest_neighbor}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(18, 45, 47, 49, 84, 86)" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def get_nearest_neighbor(node_roles, bag_of_roles):\n", | |
"    \"\"\"Pick the role combination of `bag_of_roles` closest to `node_roles`.\n", | |
"\n", | |
"    Candidates are first narrowed to those sharing the most roles with `node_roles`,\n", | |
"    then ranked by Jaccard similarity; on ties the first candidate wins. Returns None\n", | |
"    when no candidate shares any role.\n", | |
"    \"\"\"\n", | |
"    node_roles_intersect = get_intersects(node_roles, bag_of_roles)\n", | |
"    if not node_roles_intersect:\n", | |
"        return None\n", | |
"\n", | |
"    max_inter = max(node_roles_intersect.values())\n", | |
"    # builtin sorted() rather than the `sort` that only exists after %pylab\n", | |
"    nearest_neighbors = [tuple(sorted(k)) for k, v in node_roles_intersect.items() if v == max_inter]\n", | |
"\n", | |
"    jac_similarities = get_jaccard_similarities(node_roles, nearest_neighbors, max_inter)\n", | |
"    # max() returns the first key reaching the maximum, i.e. the earliest candidate on ties\n", | |
"    return max(jac_similarities, key=jac_similarities.get)\n", | |
"\n", | |
"get_nearest_neighbor((1, 18, 45, 47, 49, 84, 86), roles_counts)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_filtered_features_per_uast(uasts):\n", | |
"    \"\"\"Build, for each UAST, its list of (roles, quantized number of children) features.\n", | |
"\n", | |
"    Nodes are visited breadth-first. Role combinations listed in `roles2merge` are\n", | |
"    replaced by their nearest neighbor; nodes for which no neighbor exists are dropped.\n", | |
"    \"\"\"\n", | |
"    rare_roles = set(roles2merge)      # hash lookups instead of scanning the list per node\n", | |
"    bag_of_roles = set(roles_counts)   # built once, not once per rare node\n", | |
"    replacements = {}                  # rare roles -> nearest neighbor (or None), computed once each\n", | |
"\n", | |
"    filtered_features_per_uast = []\n", | |
"    for uast in uasts:\n", | |
"        uast_features = []\n", | |
"        queue = deque([uast])\n", | |
"        while queue:\n", | |
"            child = queue.popleft()\n", | |
"            queue.extend(child.children)\n", | |
"            roles = tuple(sorted(child.roles))\n", | |
"            if roles in rare_roles:\n", | |
"                if roles not in replacements:\n", | |
"                    replacements[roles] = get_nearest_neighbor(roles, bag_of_roles)\n", | |
"                roles = replacements[roles]\n", | |
"                if not roles:\n", | |
"                    continue  # rare combination without any neighbor: node is skipped\n", | |
"            uast_features.append((roles, stairvalue(len(child.children))))\n", | |
"        filtered_features_per_uast.append(uast_features)\n", | |
"    return filtered_features_per_uast\n", | |
"\n", | |
"filtered_features_per_uast = get_filtered_features_per_uast(uasts)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"filtered_features = [f for features_uast in filtered_features_per_uast for f in features_uast] " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[((34, 57), 5),\n", | |
" ((18,), 1),\n", | |
" ((19, 41, 42), 1),\n", | |
" ((19, 41, 42), 1),\n", | |
" ((19, 41, 42), 1),\n", | |
" ((1, 41, 45, 47), 2),\n", | |
" ((1, 41, 45, 47), 2),\n", | |
" ((1, 41, 45, 47), 2),\n", | |
" ((1, 41, 45, 47), 2),\n", | |
" ((19, 60), 2)]" | |
] | |
}, | |
"execution_count": 62, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"filtered_features[:10]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# TF-IDF computation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(((18,), 1), 40),\n", | |
" (((106,), 0), 34),\n", | |
" (((1, 18), 0), 30),\n", | |
" (((4, 18, 104), 2), 24),\n", | |
" (((18, 88, 98, 103), 0), 24),\n", | |
" (((1, 6, 18), 0), 23),\n", | |
" (((1, 18, 84, 85), 1), 21),\n", | |
" (((1, 2, 18, 48, 84), 0), 19),\n", | |
" (((18, 45, 47, 49, 84, 86, 88, 98, 103), 0), 18),\n", | |
" (((1, 18, 45, 47, 49, 84, 86), 0), 16)]" | |
] | |
}, | |
"execution_count": 63, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Term frequencies: occurrences of each feature within each UAST.\n", | |
"features_TF = [Counter(filtered_features_per_uast[i]) for i in range(len(uasts))]\n", | |
"features_TF[0].most_common(10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": { | |
"collapsed": true, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Document frequencies: number of UASTs in which each feature occurs at least once.\n", | |
"df_counter = Counter()\n", | |
"for features_uast in filtered_features_per_uast:\n", | |
"    df_counter.update(set(features_uast))  # set(): a UAST counts once per feature\n", | |
"features_DF = dict(df_counter)  # plain dict, so an unknown feature raises KeyError" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# NOTE: each weight is tf / df (count in the UAST over number of UASTs containing the feature), with no logarithm.\n", | |
"all_features = set(filtered_features)\n", | |
"TFIDF_weights = [\n", | |
"    {feature: features_TF[i][feature] / features_DF[feature] for feature in all_features}\n", | |
"    for i in range(len(uasts))\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{((1, 2, 18), 0): 0.025,\n", | |
" ((1, 2, 18), 1): 0.0,\n", | |
" ((1, 2, 18, 48, 84), 0): 0.095,\n", | |
" ((1, 2, 18, 48, 84), 1): 0.0,\n", | |
" ((1, 4, 6, 18), 0): 0.0,\n", | |
" ((1, 4, 6, 18), 1): 0.0,\n", | |
" ((1, 4, 7, 18), 0): 0.0,\n", | |
" ((1, 4, 7, 18), 1): 0.0,\n", | |
" ((1, 6, 18), 0): 0.11616161616161616,\n", | |
" ((1, 6, 18), 1): 0.03,\n", | |
" ((1, 7, 18), 0): 0.010309278350515464,\n", | |
" ((1, 7, 18), 1): 0.0,\n", | |
" ((1, 18), 0): 0.15151515151515152,\n", | |
" ((1, 18), 1): 0.015,\n", | |
" ((1, 18, 19, 67, 69), 0): 0.0,\n", | |
" ((1, 18, 19, 67, 69), 2): 0.0,\n", | |
" ((1, 18, 41, 45, 47, 49), 0): 0.02040816326530612,\n", | |
" ((1, 18, 41, 45, 47, 49), 1): 0.025252525252525252,\n", | |
" ((1, 18, 45, 47, 49, 84, 86), 0): 0.08080808080808081,\n", | |
" ((1, 18, 45, 47, 49, 84, 86), 1): 0.0,\n", | |
" ((1, 18, 45, 84, 85), 0): 0.0,\n", | |
" ((1, 18, 45, 84, 85), 1): 0.00558659217877095,\n", | |
" ((1, 18, 45, 84, 85), 2): 0.011976047904191617,\n", | |
" ((1, 18, 45, 84, 85), 3): 0.01098901098901099,\n", | |
" ((1, 18, 48, 84), 1): 0.00510204081632653,\n", | |
" ((1, 18, 49, 50), 0): 0.0,\n", | |
" ((1, 18, 49, 50), 1): 0.0,\n", | |
" ((1, 18, 49, 50), 2): 0.0,\n", | |
" ((1, 18, 49, 50), 3): 0.0,\n", | |
" ((1, 18, 49, 84, 85, 86), 0): 0.0,\n", | |
" ((1, 18, 49, 84, 85, 86), 1): 0.008928571428571428,\n", | |
" ((1, 18, 49, 84, 86), 0): 0.013888888888888888,\n", | |
" ((1, 18, 49, 84, 86), 1): 0.015503875968992248,\n", | |
" ((1, 18, 50, 93), 0): 0.0,\n", | |
" ((1, 18, 50, 93), 1): 0.0,\n", | |
" ((1, 18, 60, 61), 0): 0.0,\n", | |
" ((1, 18, 60, 61), 1): 0.0,\n", | |
" ((1, 18, 61, 71), 0): 0.0,\n", | |
" ((1, 18, 67), 0): 0.011695906432748537,\n", | |
" ((1, 18, 67), 1): 0.0,\n", | |
" ((1, 18, 67, 69), 0): 0.011560693641618497,\n", | |
" ((1, 18, 67, 69), 1): 0.0,\n", | |
" ((1, 18, 84, 85), 0): 0.08080808080808081,\n", | |
" ((1, 18, 84, 85), 1): 0.105,\n", | |
" ((1, 19, 41, 100), 1): 0.0,\n", | |
" ((1, 19, 41, 100), 2): 0.0,\n", | |
" ((1, 41, 45, 47), 2): 0.025252525252525252,\n", | |
" ((1, 41, 45, 47), 3): 0.0,\n", | |
" ((1, 42, 43), 0): 0.015,\n", | |
" ((1, 42, 43), 1): 0.0,\n", | |
" ((1, 42, 44), 0): 0.0,\n", | |
" ((3, 4, 5, 18), 0): 0.0,\n", | |
" ((3, 4, 5, 18), 2): 0.00684931506849315,\n", | |
" ((3, 4, 5, 18, 60, 61), 2): 0.00510204081632653,\n", | |
" ((3, 4, 11, 15), 0): 0.005988023952095809,\n", | |
" ((3, 4, 11, 17), 0): 0.0,\n", | |
" ((3, 4, 11, 21), 0): 0.01020408163265306,\n", | |
" ((3, 4, 18, 35, 115), 0): 0.0,\n", | |
" ((3, 4, 18, 37, 115), 0): 0.0,\n", | |
" ((3, 4, 18, 38, 115), 0): 0.0,\n", | |
" ((3, 4, 18, 39, 49, 84, 86, 115), 0): 0.005649717514124294,\n", | |
" ((3, 4, 18, 39, 115), 0): 0.0,\n", | |
" ((3, 4, 19, 104), 3): 0.0,\n", | |
" ((3, 4, 20, 21, 116), 0): 0.005154639175257732,\n", | |
" ((3, 4, 20, 116), 0): 0.04081632653061224,\n", | |
" ((3, 4, 21, 26, 116), 0): 0.0,\n", | |
" ((3, 4, 21, 27, 116), 0): 0.0,\n", | |
" ((3, 4, 22, 116), 0): 0.0,\n", | |
" ((3, 4, 25, 116), 0): 0.0,\n", | |
" ((3, 4, 26, 116), 0): 0.0,\n", | |
" ((3, 4, 27, 116), 0): 0.026041666666666668,\n", | |
" ((3, 4, 35, 115), 0): 0.0,\n", | |
" ((4, 6, 18), 2): 0.0,\n", | |
" ((4, 6, 18), 3): 0.0,\n", | |
" ((4, 6, 18, 45, 84), 1): 0.0,\n", | |
" ((4, 6, 18, 45, 84), 2): 0.0,\n", | |
" ((4, 6, 18, 45, 84), 3): 0.0,\n", | |
" ((4, 6, 18, 49, 84, 86, 88, 98, 103), 0): 0.005649717514124294,\n", | |
" ((4, 6, 18, 49, 84, 86, 88, 98, 103), 1): 0.0,\n", | |
" ((4, 6, 18, 88, 95, 103), 0): 0.0,\n", | |
" ((4, 6, 18, 88, 95, 103), 3): 0.0,\n", | |
" ((4, 6, 18, 88, 95, 103), 4): 0.0,\n", | |
" ((4, 6, 18, 88, 98, 103), 0): 0.0,\n", | |
" ((4, 6, 18, 88, 98, 103), 1): 0.0,\n", | |
" ((4, 6, 18, 109), 2): 0.0,\n", | |
" ((4, 7, 18), 3): 0.0,\n", | |
" ((4, 7, 18, 45, 84), 1): 0.0,\n", | |
" ((4, 7, 18, 45, 84), 2): 0.0,\n", | |
" ((4, 7, 18, 45, 84), 3): 0.0,\n", | |
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 0): 0.0,\n", | |
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 1): 0.0,\n", | |
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 2): 0.008333333333333333,\n", | |
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 3): 0.0,\n", | |
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 4): 0.0,\n", | |
" ((4, 7, 18, 49, 84, 86, 88, 99, 103), 5): 0.0,\n", | |
" ((4, 7, 18, 88, 95, 103), 0): 0.0,\n", | |
" ((4, 7, 18, 88, 98, 103), 0): 0.0,\n", | |
" ((4, 7, 18, 88, 99, 103), 1): 0.0,\n", | |
" ((4, 7, 18, 88, 99, 103), 2): 0.0,\n", | |
" ((4, 7, 18, 88, 99, 103), 3): 0.0,\n", | |
" ((4, 7, 18, 88, 99, 103), 4): 0.0,\n", | |
" ((4, 7, 18, 88, 99, 103), 5): 0.0,\n", | |
" ((4, 18), 3): 0.005847953216374269,\n", | |
" ((4, 18, 45, 47, 49, 84, 86), 2): 0.0,\n", | |
" ((4, 18, 45, 47, 49, 84, 86), 3): 0.0,\n", | |
" ((4, 18, 49, 84, 86), 3): 0.005649717514124294,\n", | |
" ((4, 18, 50, 93), 3): 0.0,\n", | |
" ((4, 18, 60, 61), 3): 0.06565656565656566,\n", | |
" ((4, 18, 93, 102), 3): 0.0,\n", | |
" ((4, 18, 104), 2): 0.12,\n", | |
" ((4, 18, 104), 3): 0.0,\n", | |
" ((6, 18, 45, 84), 1): 0.0,\n", | |
" ((6, 18, 45, 84), 2): 0.005154639175257732,\n", | |
" ((6, 18, 45, 84), 3): 0.0,\n", | |
" ((6, 18, 88, 98, 103), 0): 0.044444444444444446,\n", | |
" ((6, 18, 88, 98, 103), 1): 0.0,\n", | |
" ((6, 18, 88, 99, 103), 1): 0.0,\n", | |
" ((6, 18, 88, 99, 103), 2): 0.005780346820809248,\n", | |
" ((6, 18, 88, 99, 103), 3): 0.0,\n", | |
" ((6, 18, 88, 99, 103), 4): 0.0,\n", | |
" ((6, 18, 88, 99, 103), 5): 0.0,\n", | |
" ((6, 18, 109), 2): 0.017142857142857144,\n", | |
" ((7, 18), 1): 0.0707070707070707,\n", | |
" ((7, 18), 2): 0.0,\n", | |
" ((7, 18), 3): 0.0,\n", | |
" ((7, 18, 45, 84), 1): 0.00510204081632653,\n", | |
" ((7, 18, 45, 84), 2): 0.025,\n", | |
" ((7, 18, 45, 84), 3): 0.015463917525773196,\n", | |
" ((7, 18, 45, 84), 4): 0.0,\n", | |
" ((7, 18, 45, 84), 5): 0.0,\n", | |
" ((7, 18, 67, 92), 2): 0.0,\n", | |
" ((7, 18, 88, 92, 103), 0): 0.005988023952095809,\n", | |
" ((7, 18, 88, 92, 103), 1): 0.0072992700729927005,\n", | |
" ((7, 18, 88, 92, 103), 2): 0.01639344262295082,\n", | |
" ((7, 18, 88, 92, 103), 3): 0.0,\n", | |
" ((7, 18, 88, 92, 103), 4): 0.0,\n", | |
" ((7, 18, 88, 92, 103), 5): 0.0,\n", | |
" ((7, 18, 88, 93, 103), 0): 0.012121212121212121,\n", | |
" ((7, 18, 88, 93, 103), 2): 0.0,\n", | |
" ((7, 18, 88, 93, 103), 4): 0.0,\n", | |
" ((7, 18, 88, 93, 103), 5): 0.0,\n", | |
" ((7, 18, 88, 93, 103), 6): 0.0,\n", | |
" ((7, 18, 88, 95, 103), 0): 0.0,\n", | |
" ((7, 18, 88, 98, 103), 0): 0.032520325203252036,\n", | |
" ((7, 18, 88, 99, 103), 2): 0.0,\n", | |
" ((7, 18, 88, 99, 103), 3): 0.012658227848101266,\n", | |
" ((7, 18, 88, 99, 103), 4): 0.0,\n", | |
" ((7, 18, 88, 99, 103), 5): 0.0,\n", | |
" ((7, 18, 109), 2): 0.015306122448979591,\n", | |
" ((11, 18, 60, 61, 109), 2): 0.0,\n", | |
" ((11, 18, 60, 61, 109), 3): 0.005988023952095809,\n", | |
" ((11, 18, 60, 61, 109), 4): 0.0,\n", | |
" ((18,), 1): 0.20202020202020202,\n", | |
" ((18,), 2): 0.0,\n", | |
" ((18,), 3): 0.0,\n", | |
" ((18, 19, 45, 67, 69, 84), 1): 0.0,\n", | |
" ((18, 19, 45, 67, 69, 84), 2): 0.0,\n", | |
" ((18, 45, 47, 49, 67, 84, 86, 92), 2): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86), 1): 0.006211180124223602,\n", | |
" ((18, 45, 47, 49, 84, 86), 2): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86), 3): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86), 4): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 0): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 1): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 2): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 3): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 4): 0.011363636363636364,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 92, 103), 5): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 95, 103), 0): 0.020202020202020204,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 95, 103), 5): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 98, 103), 0): 0.09090909090909091,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 99, 103), 1): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 99, 103), 2): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 88, 99, 103), 4): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 109), 1): 0.0,\n", | |
" ((18, 45, 47, 49, 84, 86, 109), 2): 0.03954802259887006,\n", | |
" ((18, 45, 48, 84), 1): 0.0,\n", | |
" ((18, 45, 48, 84), 2): 0.0,\n", | |
" ((18, 45, 48, 84), 3): 0.0,\n", | |
" ((18, 45, 48, 84), 4): 0.0,\n", | |
" ((18, 45, 49, 50, 84), 1): 0.0,\n", | |
" ((18, 45, 49, 84, 86), 2): 0.008771929824561403,\n", | |
" ((18, 45, 49, 84, 86), 3): 0.0,\n", | |
" ((18, 45, 50, 84, 93), 3): 0.0,\n", | |
" ((18, 45, 60, 61, 84), 2): 0.006944444444444444,\n", | |
" ((18, 45, 60, 61, 84), 3): 0.015873015873015872,\n", | |
" ((18, 45, 60, 61, 84), 4): 0.0,\n", | |
" ((18, 45, 67, 84), 1): 0.0,\n", | |
" ((18, 45, 67, 84), 2): 0.0,\n", | |
" ((18, 45, 67, 84), 3): 0.0,\n", | |
" ((18, 45, 67, 84), 4): 0.0,\n", | |
" ((18, 45, 84), 1): 0.01020408163265306,\n", | |
" ((18, 45, 84), 2): 0.0707070707070707,\n", | |
" ((18, 45, 84), 3): 0.04142011834319527,\n", | |
" ((18, 45, 84), 4): 0.005988023952095809,\n", | |
" ((18, 45, 84), 5): 0.0,\n", | |
" ((18, 48, 84, 88, 98, 103), 0): 0.006134969325153374,\n", | |
" ((18, 48, 84, 88, 98, 103), 1): 0.0,\n", | |
" ((18, 48, 84, 109), 2): 0.00847457627118644,\n", | |
" ((18, 49, 50, 88, 92, 103), 0): 0.0,\n", | |
" ((18, 49, 50, 88, 92, 103), 2): 0.0,\n", | |
" ((18, 49, 50, 88, 92, 103), 4): 0.0,\n", | |
" ((18, 49, 50, 88, 92, 103), 5): 0.0,\n", | |
" ((18, 49, 50, 88, 98, 103), 0): 0.0,\n", | |
" ((18, 49, 84, 86, 88, 98, 103), 0): 0.005917159763313609,\n", | |
" ((18, 49, 84, 86, 88, 98, 103), 1): 0.00909090909090909,\n", | |
" ((18, 50, 88, 92, 93, 103), 2): 0.0,\n", | |
" ((18, 50, 88, 92, 93, 103), 3): 0.0,\n", | |
" ((18, 50, 88, 93, 98, 103), 0): 0.0,\n", | |
" ((18, 50, 88, 93, 98, 103), 1): 0.0,\n", | |
" ((18, 50, 88, 93, 98, 103), 2): 0.0,\n", | |
" ((18, 67, 69, 88, 99, 103), 2): 0.0,\n", | |
" ((18, 67, 69, 88, 99, 103), 3): 0.0,\n", | |
" ((18, 67, 69, 88, 99, 103), 4): 0.0,\n", | |
" ((18, 67, 70, 109), 2): 0.0,\n", | |
" ((18, 67, 70, 109), 3): 0.0,\n", | |
" ((18, 88, 93, 98, 102, 103), 0): 0.0,\n", | |
" ((18, 88, 93, 98, 102, 103), 4): 0.0,\n", | |
" ((18, 88, 95, 103), 0): 0.03,\n", | |
" ((18, 88, 95, 103), 1): 0.0,\n", | |
" ((18, 88, 98, 103), 0): 0.12244897959183673,\n", | |
" ((18, 88, 98, 103), 1): 0.006060606060606061,\n", | |
" ((18, 88, 99, 103), 0): 0.0,\n", | |
" ((18, 88, 99, 103), 2): 0.0,\n", | |
" ((18, 88, 99, 103), 3): 0.0,\n", | |
" ((18, 88, 99, 103), 4): 0.0,\n", | |
" ((18, 88, 99, 103), 5): 0.0,\n", | |
" ((18, 109), 1): 0.075,\n", | |
" ((18, 109), 2): 0.005952380952380952,\n", | |
" ((18, 109), 5): 0.0,\n", | |
" ((19, 41, 42), 1): 0.015463917525773196,\n", | |
" ((19, 41, 42), 2): 0.0,\n", | |
" ((19, 41, 42), 3): 0.0,\n", | |
" ((19, 41, 42), 4): 0.0,\n", | |
" ((19, 60), 2): 0.05555555555555555,\n", | |
" ((19, 60), 3): 0.030612244897959183,\n", | |
" ((19, 67, 70), 3): 0.011560693641618497,\n", | |
" ((19, 67, 70), 4): 0.0,\n", | |
" ((19, 71), 2): 0.0,\n", | |
" ((19, 73), 0): 0.0,\n", | |
" ((19, 73), 1): 0.0,\n", | |
" ((19, 74), 0): 0.008771929824561403,\n", | |
" ((19, 74), 1): 0.0,\n", | |
" ((19, 78), 0): 0.0,\n", | |
" ((19, 78), 1): 0.015306122448979591,\n", | |
" ((19, 79, 80), 1): 0.0,\n", | |
" ((19, 79, 80), 2): 0.0,\n", | |
" ((19, 79, 80), 3): 0.0,\n", | |
" ((19, 79, 80), 4): 0.0,\n", | |
" ((19, 79, 80), 5): 0.0,\n", | |
" ((19, 79, 81), 3): 0.0,\n", | |
" ((19, 79, 81), 4): 0.0,\n", | |
" ((19, 79, 81), 5): 0.0,\n", | |
" ((19, 82), 0): 0.0,\n", | |
" ((19, 82), 1): 0.0,\n", | |
" ((19, 82), 2): 0.0,\n", | |
" ((19, 83), 1): 0.0,\n", | |
" ((19, 83), 2): 0.0,\n", | |
" ((19, 87), 0): 0.0,\n", | |
" ((19, 87), 1): 0.0,\n", | |
" ((34, 57), 4): 0.0,\n", | |
" ((34, 57), 5): 0.016129032258064516,\n", | |
" ((34, 57), 6): 0.0,\n", | |
" ((41, 45, 46), 1): 0.006944444444444444,\n", | |
" ((41, 45, 46), 2): 0.0,\n", | |
" ((41, 45, 46), 3): 0.0,\n", | |
" ((41, 45, 46), 4): 0.0,\n", | |
" ((41, 45, 46), 5): 0.020618556701030927,\n", | |
" ((41, 45, 46), 6): 0.0,\n", | |
" ((41, 45, 49, 50, 109), 1): 0.0,\n", | |
" ((41, 45, 49, 50, 109), 2): 0.0,\n", | |
" ((41, 45, 49, 50, 109), 3): 0.0,\n", | |
" ((41, 45, 49, 109), 0): 0.0,\n", | |
" ((41, 45, 49, 109), 1): 0.01020408163265306,\n", | |
" ((41, 45, 49, 109), 2): 0.010638297872340425,\n", | |
" ((41, 45, 49, 109), 3): 0.005988023952095809,\n", | |
" ((41, 45, 49, 109), 4): 0.0,\n", | |
" ((41, 45, 49, 109), 5): 0.0,\n", | |
" ((41, 45, 84, 109), 1): 0.0,\n", | |
" ((41, 46, 100), 1): 0.0,\n", | |
" ((41, 46, 100), 3): 0.0,\n", | |
" ((41, 46, 100), 4): 0.0,\n", | |
" ((41, 46, 100), 5): 0.0,\n", | |
" ((41, 46, 100), 6): 0.0,\n", | |
" ((41, 52, 100), 1): 0.0,\n", | |
" ((45, 47, 49, 84), 1): 0.0,\n", | |
" ((46, 60, 62), 1): 0.050505050505050504,\n", | |
" ((46, 60, 62), 2): 0.030303030303030304,\n", | |
" ((46, 60, 62), 3): 0.0,\n", | |
" ((46, 60, 62), 4): 0.005208333333333333,\n", | |
" ((46, 60, 62), 5): 0.0,\n", | |
" ((46, 60, 63), 1): 0.02040816326530612,\n", | |
" ((46, 60, 63), 2): 0.005917159763313609,\n", | |
" ((46, 60, 63), 3): 0.0,\n", | |
" ((46, 60, 63), 4): 0.01098901098901099,\n", | |
" ((46, 60, 63), 5): 0.0,\n", | |
" ((46, 67), 1): 0.005847953216374269,\n", | |
" ((46, 67), 2): 0.0,\n", | |
" ((46, 67), 3): 0.0,\n", | |
" ((46, 67), 4): 0.006944444444444444,\n", | |
" ((46, 67), 5): 0.0,\n", | |
" ((46, 71), 1): 0.0,\n", | |
" ((46, 71), 2): 0.0,\n", | |
" ((46, 71), 3): 0.0,\n", | |
" ((46, 71), 4): 0.0,\n", | |
" ((46, 71), 5): 0.0,\n", | |
" ((49, 84, 86, 108), 1): 0.009259259259259259,\n", | |
" ((49, 84, 86, 108), 2): 0.0,\n", | |
" ((106,), 0): 0.17,\n", | |
" ((108,), 1): 0.035,\n", | |
" ((108,), 2): 0.02857142857142857,\n", | |
" ((108,), 3): 0.0,\n", | |
" ((108,), 4): 0.0,\n", | |
" ((108,), 5): 0.005050505050505051}" | |
] | |
}, | |
"execution_count": 66, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"TFIDF_weights[0]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Internal types extraction" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_internal_types(node):\n", | |
" bag_of_internal_types = []\n", | |
" for uast in uasts:\n", | |
" queue = [uast]\n", | |
" while queue:\n", | |
" child = queue.pop(0)\n", | |
" queue.extend(child.children)\n", | |
" bag_of_internal_types.append(child.internal_type)\n", | |
" return bag_of_internal_types" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('Name', 106009),\n", | |
" ('Attribute', 40977),\n", | |
" ('Call', 28867),\n", | |
" ('Str', 26104),\n", | |
" ('NoopLine', 21135),\n", | |
" ('Assign', 16518),\n", | |
" ('PreviousNoops', 13493),\n", | |
" ('Expr', 13400),\n", | |
" ('If.body', 8257),\n", | |
" ('If', 8257)]" | |
] | |
}, | |
"execution_count": 68, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"bag_of_internal_types = get_internal_types(uasts)\n", | |
"internal_types_counts = Counter(bag_of_internal_types)\n", | |
"internal_types_counts.most_common(10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment