simin75simin · August 4, 2024 10:04
diff --git a/peoeis.ipynb b/peoeis.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<>:2: SyntaxWarning: invalid escape sequence '\\s'\n",
      "<>:2: SyntaxWarning: invalid escape sequence '\\s'\n",
      "C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_9396\\1673588957.py:2: SyntaxWarning: invalid escape sequence '\\s'\n",
      "  with open('data\\solved_problem_ids.txt','r') as f:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(151, ['1', '2', '3', '6', '5'])"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# with open('data\\solved_problem_ids.txt','r') as f:\n",
    "#     solved_problem_ids = f.read().splitlines()\n",
    "# solved_problem_ids = [x for x in solved_problem_ids if x.isnumeric()]\n",
    "# len(solved_problem_ids), solved_problem_ids[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e1b8d517af9f46a78a6b90cbd2277271",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# # crawl solved solutions....just don't post it publicly...\n",
    "# cookies = {'PHPSESSID': 'YOUR_PHPSE'}\n",
    "# for i in tqdm(solved_problem_ids[-2:]):\n",
    "#     attempt=0\n",
    "#     while attempt<5:\n",
    "#         try:\n",
    "#             u=f'https://projecteuler.net/thread={i};page=1'\n",
    "#             r=requests.get(u, cookies=cookies)\n",
    "#             if r.status_code != 200:\n",
    "#                 print(f'Error: {r.status_code} {u}')\n",
    "#                 continue\n",
    "#             idx=r.text.rfind('page=')\n",
    "#             if idx==-1:\n",
    "#                 pageno=1\n",
    "#             else:\n",
    "#                 pageno=-1\n",
    "#                 for j in range(idx+5,idx+10):\n",
    "#                     if not r.text[j].isnumeric():\n",
    "#                         pageno=int(r.text[idx+5:j])\n",
    "#                         break\n",
    "#             for p in range(1,pageno+1):\n",
    "#                 u=f'https://projecteuler.net/thread={i};page={str(p)}'\n",
    "#                 r=requests.get(u, cookies=cookies)\n",
    "#                 if r.status_code != 200:\n",
    "#                     print(f'Error: {r.status_code} {u}')\n",
    "#                     continue\n",
    "#                 with open(f'results/problem_{i}_page_{p}.html','w') as f:\n",
    "#                     f.write(r.text)\n",
    "#             break\n",
    "#         except Exception as e:\n",
    "#             attempt+=1\n",
    "#             print(f'Error: {e} {u}')\n",
    "#             continue"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# problem stmt crawler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# crawl problem statements\n",
    "for i in tqdm(solved_problem_ids):\n",
    "    attempt=0\n",
    "    while attempt<5:\n",
    "        try:\n",
    "            u=f'https://projecteuler.net/minimal={i}'\n",
    "            r=requests.get(u, cookies=cookies)\n",
    "            with open(f'problem_statements_html/problem_{i}.html','w') as f:\n",
    "                f.write(r.text)\n",
    "            break\n",
    "        except Exception as e:\n",
    "            attempt+=1\n",
    "            print(f'Error: {e} {u}')\n",
    "            continue"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# oeis check..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "import time\n",
    "from numpy.random import choice\n",
    "import random\n",
    "\n",
    "def extract_big_integers(input_string):\n",
    "    # Regular expression to find integers with more than 6 digits\n",
    "    pattern = r'\\b\\d{7,}\\b'\n",
    "    \n",
    "    # Find all matches in the input string\n",
    "    matches = re.findall(pattern, input_string)\n",
    "    \n",
    "    # Convert matches to integers\n",
    "    big_integers = [int(match) for match in matches]\n",
    "    \n",
    "    return big_integers\n",
    "\n",
    "def get_oeis(problem):\n",
    "    path=f'problem_statements_html/problem_{problem}.html'\n",
    "\n",
    "    with open(path, 'r',encoding='cp1252') as file:\n",
    "        input_string = file.read()\n",
    "    big_integers = extract_big_integers(input_string)\n",
    "    if len(big_integers)>5:\n",
    "        big_integers=choice(big_integers, 5)\n",
    "\n",
    "    result=''\n",
    "    for i in big_integers:\n",
    "        url=f'https://oeis.org/search?q={i}&go=Search'\n",
    "        response = requests.get(url)\n",
    "        html = response.text\n",
    "        if \"Sorry, but the terms do not match anything in the table.\" in html:\n",
    "            continue\n",
    "        if not html:\n",
    "            print('NOTHING')\n",
    "            time.sleep(5)\n",
    "        result+=html\n",
    "        time.sleep(random.uniform(0.5, 1.5))\n",
    "\n",
    "    if not result:\n",
    "        return\n",
    "    \n",
    "    with open(f'oeis/oeis_{problem}.html', 'w',encoding='utf-8') as file:\n",
    "        file.write(result)\n",
    "\n",
    "from tqdm.notebook import tqdm\n",
    "for i in tqdm(range(101, 901)):\n",
    "    attempts=0\n",
    "    while attempts<5:\n",
    "        try:\n",
    "            get_oeis(i)\n",
    "            break\n",
    "        except Exception as e:\n",
    "            attempts+=1\n",
    "            print(f\"Error in problem {i}: {e}, attempt {attempts}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"<>:2: SyntaxWarning: invalid escape sequence '\\s'\n",
	"<>:2: SyntaxWarning: invalid escape sequence '\\s'\n",
	"C:\\Users\\Administrator\\AppData\\Local\\Temp\\ipykernel_9396\\1673588957.py:2: SyntaxWarning: invalid escape sequence '\\s'\n",
	" with open('data\\solved_problem_ids.txt','r') as f:\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"(151, ['1', '2', '3', '6', '5'])"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import pandas as pd\n",
	"# with open('data\\solved_problem_ids.txt','r') as f:\n",
	"#     solved_problem_ids = f.read().splitlines()\n",
	"# solved_problem_ids = [x for x in solved_problem_ids if x.isnumeric()]\n",
	"# len(solved_problem_ids), solved_problem_ids[:5]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "e1b8d517af9f46a78a6b90cbd2277271",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"  0%|          | 0/2 [00:00<?, ?it/s]"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"# # crawl solved solutions....just don't post it publicly...\n",
	"# cookies = {'PHPSESSID': 'YOUR_PHPSE'}\n",
	"# for i in tqdm(solved_problem_ids[-2:]):\n",
	"#     attempt=0\n",
	"#     while attempt<5:\n",
	"#         try:\n",
	"#             u=f'https://projecteuler.net/thread={i};page=1'\n",
	"#             r=requests.get(u, cookies=cookies)\n",
	"#             if r.status_code != 200:\n",
	"#                 print(f'Error: {r.status_code} {u}')\n",
	"#                 continue\n",
	"#             idx=r.text.rfind('page=')\n",
	"#             if idx==-1:\n",
	"#                 pageno=1\n",
	"#             else:\n",
	"#                 pageno=-1\n",
	"#                 for j in range(idx+5,idx+10):\n",
	"#                     if not r.text[j].isnumeric():\n",
	"#                         pageno=int(r.text[idx+5:j])\n",
	"#                         break\n",
	"#             for p in range(1,pageno+1):\n",
	"#                 u=f'https://projecteuler.net/thread={i};page={str(p)}'\n",
	"#                 r=requests.get(u, cookies=cookies)\n",
	"#                 if r.status_code != 200:\n",
	"#                     print(f'Error: {r.status_code} {u}')\n",
	"#                     continue\n",
	"#                 with open(f'results/problem_{i}_page_{p}.html','w') as f:\n",
	"#                     f.write(r.text)\n",
	"#             break\n",
	"#         except Exception as e:\n",
	"#             attempt+=1\n",
	"#             print(f'Error: {e} {u}')\n",
	"#             continue"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# problem stmt crawler"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# crawl problem statements\n",
	"for i in tqdm(solved_problem_ids):\n",
	"    attempt=0\n",
	"    while attempt<5:\n",
	"        try:\n",
	"            u=f'https://projecteuler.net/minimal={i}'\n",
	"            r=requests.get(u, cookies=cookies)\n",
	"            with open(f'problem_statements_html/problem_{i}.html','w') as f:\n",
	"                f.write(r.text)\n",
	"            break\n",
	"        except Exception as e:\n",
	"            attempt+=1\n",
	"            print(f'Error: {e} {u}')\n",
	"            continue"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# oeis check..."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import requests\n",
	"from bs4 import BeautifulSoup\n",
	"import re\n",
	"import time\n",
	"from numpy.random import choice\n",
	"import random\n",
	"\n",
	"def extract_big_integers(input_string):\n",
	"    # Regular expression to find integers with more than 6 digits\n",
	"    pattern = r'\\b\\d{7,}\\b'\n",
	"    \n",
	"    # Find all matches in the input string\n",
	"    matches = re.findall(pattern, input_string)\n",
	"    \n",
	"    # Convert matches to integers\n",
	"    big_integers = [int(match) for match in matches]\n",
	"    \n",
	"    return big_integers\n",
	"\n",
	"def get_oeis(problem):\n",
	"    path=f'problem_statements_html/problem_{problem}.html'\n",
	"\n",
	"    with open(path, 'r',encoding='cp1252') as file:\n",
	"        input_string = file.read()\n",
	"    big_integers = extract_big_integers(input_string)\n",
	"    if len(big_integers)>5:\n",
	"        big_integers=choice(big_integers, 5)\n",
	"\n",
	"    result=''\n",
	"    for i in big_integers:\n",
	"        url=f'https://oeis.org/search?q={i}&go=Search'\n",
	"        response = requests.get(url)\n",
	"        html = response.text\n",
	"        if \"Sorry, but the terms do not match anything in the table.\" in html:\n",
	"            continue\n",
	"        if not html:\n",
	"            print('NOTHING')\n",
	"            time.sleep(5)\n",
	"        result+=html\n",
	"        time.sleep(random.uniform(0.5, 1.5))\n",
	"\n",
	"    if not result:\n",
	"        return\n",
	"    \n",
	"    with open(f'oeis/oeis_{problem}.html', 'w',encoding='utf-8') as file:\n",
	"        file.write(result)\n",
	"\n",
	"from tqdm.notebook import tqdm\n",
	"for i in tqdm(range(101, 901)):\n",
	"    attempts=0\n",
	"    while attempts<5:\n",
	"        try:\n",
	"            get_oeis(i)\n",
	"            break\n",
	"        except Exception as e:\n",
	"            attempts+=1\n",
	"            print(f\"Error in problem {i}: {e}, attempt {attempts}\")"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}