Skip to content

Instantly share code, notes, and snippets.

@ssghost
Last active December 10, 2024 08:32
Show Gist options
  • Save ssghost/df36ea84fc937346962d3d4f94d39ab7 to your computer and use it in GitHub Desktop.
Save ssghost/df36ea84fc937346962d3d4f94d39ab7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pyspark in ./.venv/lib/python3.12/site-packages (3.5.3)\n",
"Requirement already satisfied: dbldatagen in ./.venv/lib/python3.12/site-packages (0.4.0.post1)\n",
"Requirement already satisfied: Faker in ./.venv/lib/python3.12/site-packages (33.1.0)\n",
"Requirement already satisfied: py4j==0.10.9.7 in ./.venv/lib/python3.12/site-packages (from pyspark) (0.10.9.7)\n",
"Requirement already satisfied: python-dateutil>=2.4 in ./.venv/lib/python3.12/site-packages (from Faker) (2.9.0.post0)\n",
"Requirement already satisfied: typing-extensions in ./.venv/lib/python3.12/site-packages (from Faker) (4.12.2)\n",
"Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.12/site-packages (from python-dateutil>=2.4->Faker) (1.17.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install pyspark dbldatagen Faker"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: jmespath in ./.venv/lib/python3.12/site-packages (1.0.1)\n",
"Requirement already satisfied: numpy in ./.venv/lib/python3.12/site-packages (2.2.0)\n",
"Requirement already satisfied: pandas in ./.venv/lib/python3.12/site-packages (2.2.3)\n",
"Requirement already satisfied: requests in ./.venv/lib/python3.12/site-packages (2.32.3)\n",
"Collecting pyparsing\n",
" Downloading pyparsing-3.2.0-py3-none-any.whl.metadata (5.0 kB)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.12/site-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.12/site-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in ./.venv/lib/python3.12/site-packages (from requests) (3.4.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in ./.venv/lib/python3.12/site-packages (from requests) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in ./.venv/lib/python3.12/site-packages (from requests) (2.2.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in ./.venv/lib/python3.12/site-packages (from requests) (2024.8.30)\n",
"Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
"Downloading pyparsing-3.2.0-py3-none-any.whl (106 kB)\n",
"Installing collected packages: pyparsing\n",
"Successfully installed pyparsing-3.2.0\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install jmespath numpy pandas requests pyparsing"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "49a6ec16-c00d-44c9-a0f8-aa01af3a4bb9",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"from pyspark.sql.types import FloatType, IntegerType, StringType,TimestampType, DecimalType\n",
"from pyspark.sql.functions import col, date_format\n",
"import pyspark.sql.functions as F\n",
"import dbldatagen as dg\n",
"import re\n",
"import random\n",
"import requests\n",
"from string import ascii_letters\n",
"import numpy as np\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "642edfc0-d3fe-498b-b87d-2a3526b22ae7",
"showTitle": true,
"title": "1.1. Random string"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['CtEdmRVDrtqsF', 'JEvVTlzNQgAATu', 'zrwzWQKwsyLCg', 'ycfTAhchOILsNR', 'brLYCjNvtgGp', 'NfZTLCVECRWSn', 'LnfrGLOdHa', 'djCgsZHrDvHj', 'rLROIysqzHIq', 'crwqCPAagKCp']\n"
]
}
],
"source": [
"def create_random_string(num_characters: int) -> str:\n",
"    \"\"\"Build a string of ``num_characters`` random ASCII letters (upper and lower case).\"\"\"\n",
"    letters = random.choices(ascii_letters, k=num_characters)\n",
"    return ''.join(letters)\n",
"\n",
"# Draw the lengths (each in 10..14) first, then build one random string per length.\n",
"sample_size = 10\n",
"list_string_lengths = [random.randrange(10, 15, 1) for _ in range(sample_size)]\n",
"\n",
"random_strings = [create_random_string(length) for length in list_string_lengths]\n",
"\n",
"print(random_strings)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "4470c185-e985-4e04-b282-69c118e441ab",
"showTitle": true,
"title": "1.2. Random string concatenated"
}
},
"outputs": [],
"source": [
"def create_random_username(num_characters: int, username_base: str) -> str:\n",
"    \"\"\"Return ``username_base`` without spaces, an underscore, and a random letter suffix.\n",
"\n",
"    The suffix consists of ``num_characters`` random ASCII letters.\n",
"    \"\"\"\n",
"    suffix = ''.join(random.choices(ascii_letters, k=num_characters))\n",
"    prefix = username_base.replace(' ', '')\n",
"    return f\"{prefix}_{suffix}\"\n",
"\n",
"# Wrap the username generator as a string-returning UDF for PySpark DataFrame columns.\n",
"usernames_udf = F.udf(create_random_username, returnType=StringType())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "f99693d1-63f8-4c58-b5a9-a7ee5aed4c7f",
"showTitle": true,
"title": "2.1. Static API "
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Theoretical', 'Funding', 'Voice', 'Allows', 'Douglas', 'Tee', 'Counter', 'Posing', 'Andrew', 'Religious']\n"
]
}
],
"source": [
"# Function to hit the MIT word list page, retrieving 10000 words as a list.\n",
"def get_word_list(timeout: float = 10.0) -> list[str]:\n",
"    \"\"\"Download the MIT word list and return its unique words, capitalized.\n",
"\n",
"    Args:\n",
"        timeout: Seconds to wait for the server before giving up.\n",
"\n",
"    Returns:\n",
"        The distinct lines of the response, in the order produced by\n",
"        ``np.unique``, each decoded from bytes and passed through\n",
"        ``str.capitalize()``.\n",
"\n",
"    Raises:\n",
"        requests.RequestException: If the request fails, times out, or the\n",
"            server answers with an HTTP error status.\n",
"    \"\"\"\n",
"    word_site = \"https://www.mit.edu/~ecprice/wordlist.10000\"\n",
"\n",
"    response = requests.get(word_site, timeout=timeout)\n",
"    # Fail loudly instead of turning an HTTP error page into \"words\".\n",
"    response.raise_for_status()\n",
"\n",
"    # np.unique de-duplicates the raw byte lines and returns them sorted.\n",
"    unique_words = np.unique(np.array(response.content.splitlines()))\n",
"    return [word.decode().capitalize() for word in unique_words]\n",
"\n",
"# Fetch the word list, shuffle it in place, and keep the first few entries as a sample.\n",
"words_unique = get_word_list()\n",
"sample_size = 10\n",
"\n",
"random.shuffle(words_unique)\n",
"sample_words = words_unique[0:sample_size]\n",
"print(sample_words)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "ae6f4f5c-067b-4e5d-aaab-569e112002d1",
"showTitle": true,
"title": "2.2. Geographical API"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Afghanistan',\n",
" 'Albania',\n",
" 'Algeria',\n",
" 'American Samoa',\n",
" 'Andorra',\n",
" 'Angola',\n",
" 'Anguilla',\n",
" 'Antarctica',\n",
" 'Antigua and Barbuda',\n",
" 'Argentina',\n",
" 'Armenia',\n",
" 'Aruba',\n",
" 'Australia',\n",
" 'Austria',\n",
" 'Azerbaijan',\n",
" 'Bahamas',\n",
" 'Bahrain',\n",
" 'Bangladesh',\n",
" 'Barbados',\n",
" 'Belarus',\n",
" 'Belgium',\n",
" 'Belize',\n",
" 'Benin',\n",
" 'Bermuda',\n",
" 'Bhutan',\n",
" 'Bolivia',\n",
" 'Bosnia and Herzegovina',\n",
" 'Botswana',\n",
" 'Bouvet Island',\n",
" 'Brazil',\n",
" 'British Indian Ocean Territory',\n",
" 'British Virgin Islands',\n",
" 'Brunei',\n",
" 'Bulgaria',\n",
" 'Burkina Faso',\n",
" 'Burundi',\n",
" 'Cambodia',\n",
" 'Cameroon',\n",
" 'Canada',\n",
" 'Cape Verde',\n",
" 'Caribbean Netherlands',\n",
" 'Cayman Islands',\n",
" 'Central African Republic',\n",
" 'Chad',\n",
" 'Chile',\n",
" 'China',\n",
" 'Christmas Island',\n",
" 'Cocos (Keeling) Islands',\n",
" 'Colombia',\n",
" 'Comoros',\n",
" 'Cook Islands',\n",
" 'Costa Rica',\n",
" 'Croatia',\n",
" 'Cuba',\n",
" 'Curaçao',\n",
" 'Cyprus',\n",
" 'Czechia',\n",
" 'DR Congo',\n",
" 'Denmark',\n",
" 'Djibouti',\n",
" 'Dominica',\n",
" 'Dominican Republic',\n",
" 'Ecuador',\n",
" 'Egypt',\n",
" 'El Salvador',\n",
" 'Equatorial Guinea',\n",
" 'Eritrea',\n",
" 'Estonia',\n",
" 'Eswatini',\n",
" 'Ethiopia',\n",
" 'Falkland Islands',\n",
" 'Faroe Islands',\n",
" 'Fiji',\n",
" 'Finland',\n",
" 'France',\n",
" 'French Guiana',\n",
" 'French Polynesia',\n",
" 'French Southern and Antarctic Lands',\n",
" 'Gabon',\n",
" 'Gambia',\n",
" 'Georgia',\n",
" 'Germany',\n",
" 'Ghana',\n",
" 'Gibraltar',\n",
" 'Greece',\n",
" 'Greenland',\n",
" 'Grenada',\n",
" 'Guadeloupe',\n",
" 'Guam',\n",
" 'Guatemala',\n",
" 'Guernsey',\n",
" 'Guinea',\n",
" 'Guinea-Bissau',\n",
" 'Guyana',\n",
" 'Haiti',\n",
" 'Heard Island and McDonald Islands',\n",
" 'Honduras',\n",
" 'Hong Kong',\n",
" 'Hungary',\n",
" 'Iceland',\n",
" 'India',\n",
" 'Indonesia',\n",
" 'Iran',\n",
" 'Iraq',\n",
" 'Ireland',\n",
" 'Isle of Man',\n",
" 'Israel',\n",
" 'Italy',\n",
" 'Ivory Coast',\n",
" 'Jamaica',\n",
" 'Japan',\n",
" 'Jersey',\n",
" 'Jordan',\n",
" 'Kazakhstan',\n",
" 'Kenya',\n",
" 'Kiribati',\n",
" 'Kosovo',\n",
" 'Kuwait',\n",
" 'Kyrgyzstan',\n",
" 'Laos',\n",
" 'Latvia',\n",
" 'Lebanon',\n",
" 'Lesotho',\n",
" 'Liberia',\n",
" 'Libya',\n",
" 'Liechtenstein',\n",
" 'Lithuania',\n",
" 'Luxembourg',\n",
" 'Macau',\n",
" 'Madagascar',\n",
" 'Malawi',\n",
" 'Malaysia',\n",
" 'Maldives',\n",
" 'Mali',\n",
" 'Malta',\n",
" 'Marshall Islands',\n",
" 'Martinique',\n",
" 'Mauritania',\n",
" 'Mauritius',\n",
" 'Mayotte',\n",
" 'Mexico',\n",
" 'Micronesia',\n",
" 'Moldova',\n",
" 'Monaco',\n",
" 'Mongolia',\n",
" 'Montenegro',\n",
" 'Montserrat',\n",
" 'Morocco',\n",
" 'Mozambique',\n",
" 'Myanmar',\n",
" 'Namibia',\n",
" 'Nauru',\n",
" 'Nepal',\n",
" 'Netherlands',\n",
" 'New Caledonia',\n",
" 'New Zealand',\n",
" 'Nicaragua',\n",
" 'Niger',\n",
" 'Nigeria',\n",
" 'Niue',\n",
" 'Norfolk Island',\n",
" 'North Korea',\n",
" 'North Macedonia',\n",
" 'Northern Mariana Islands',\n",
" 'Norway',\n",
" 'Oman',\n",
" 'Pakistan',\n",
" 'Palau',\n",
" 'Palestine',\n",
" 'Panama',\n",
" 'Papua New Guinea',\n",
" 'Paraguay',\n",
" 'Peru',\n",
" 'Philippines',\n",
" 'Pitcairn Islands',\n",
" 'Poland',\n",
" 'Portugal',\n",
" 'Puerto Rico',\n",
" 'Qatar',\n",
" 'Republic of the Congo',\n",
" 'Romania',\n",
" 'Russia',\n",
" 'Rwanda',\n",
" 'Réunion',\n",
" 'Saint Barthélemy',\n",
" 'Saint Helena, Ascension and Tristan da Cunha',\n",
" 'Saint Kitts and Nevis',\n",
" 'Saint Lucia',\n",
" 'Saint Martin',\n",
" 'Saint Pierre and Miquelon',\n",
" 'Saint Vincent and the Grenadines',\n",
" 'Samoa',\n",
" 'San Marino',\n",
" 'Saudi Arabia',\n",
" 'Senegal',\n",
" 'Serbia',\n",
" 'Seychelles',\n",
" 'Sierra Leone',\n",
" 'Singapore',\n",
" 'Sint Maarten',\n",
" 'Slovakia',\n",
" 'Slovenia',\n",
" 'Solomon Islands',\n",
" 'Somalia',\n",
" 'South Africa',\n",
" 'South Georgia',\n",
" 'South Korea',\n",
" 'South Sudan',\n",
" 'Spain',\n",
" 'Sri Lanka',\n",
" 'Sudan',\n",
" 'Suriname',\n",
" 'Svalbard and Jan Mayen',\n",
" 'Sweden',\n",
" 'Switzerland',\n",
" 'Syria',\n",
" 'São Tomé and Príncipe',\n",
" 'Taiwan',\n",
" 'Tajikistan',\n",
" 'Tanzania',\n",
" 'Thailand',\n",
" 'Timor-Leste',\n",
" 'Togo',\n",
" 'Tokelau',\n",
" 'Tonga',\n",
" 'Trinidad and Tobago',\n",
" 'Tunisia',\n",
" 'Turkey',\n",
" 'Turkmenistan',\n",
" 'Turks and Caicos Islands',\n",
" 'Tuvalu',\n",
" 'Uganda',\n",
" 'Ukraine',\n",
" 'United Arab Emirates',\n",
" 'United Kingdom',\n",
" 'United States',\n",
" 'United States Minor Outlying Islands',\n",
" 'United States Virgin Islands',\n",
" 'Uruguay',\n",
" 'Uzbekistan',\n",
" 'Vanuatu',\n",
" 'Vatican City',\n",
" 'Venezuela',\n",
" 'Vietnam',\n",
" 'Wallis and Futuna',\n",
" 'Western Sahara',\n",
" 'Yemen',\n",
" 'Zambia',\n",
" 'Zimbabwe',\n",
" 'Åland Islands']\n",
"['Taiwan', 'Spain', 'Serbia', 'Benin', 'Iraq', 'India', 'Montenegro', 'Malaysia', 'Bahrain', 'Palau']\n"
]
}
],
"source": [
"# Documentation at https://restcountries.com/\n",
"def get_countries(timeout: float = 10.0) -> list[str]:\n",
"    \"\"\"Fetch the common name of every entry returned by the REST Countries API.\n",
"\n",
"    Args:\n",
"        timeout: Seconds to wait for the server before giving up.\n",
"\n",
"    Returns:\n",
"        One ``['name']['common']`` value per item of the JSON response, in\n",
"        response order.\n",
"\n",
"    Raises:\n",
"        requests.RequestException: If the request fails, times out, or the\n",
"            server answers with an HTTP error status.\n",
"    \"\"\"\n",
"    countries_site = \"https://restcountries.com/v3.1/all?fields=name\"\n",
"    response = requests.get(countries_site, timeout=timeout)\n",
"    # Surface HTTP errors here rather than as a confusing KeyError/TypeError below.\n",
"    response.raise_for_status()\n",
"\n",
"    return [country['name']['common'] for country in response.json()]\n",
"\n",
"countries_list = get_countries()\n",
"\n",
"pprint(sorted(countries_list))\n",
"\n",
"sample_size = 10\n",
"\n",
"# Shuffle in place, then keep the first `sample_size` entries as the sample.\n",
"random.shuffle(countries_list)\n",
"sample_countries = countries_list[:sample_size]\n",
"print(sample_countries)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "3b07d34e-a4c1-4878-9a36-dac305c2fa31",
"showTitle": true,
"title": "2.3. Recursive API "
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['delectus aut autem', 'quis ut nam facilis et officia qui', 'fugiat veniam minus', 'et porro tempora', 'laboriosam mollitia et enim quasi adipisci quia provident illum', 'qui ullam ratione quibusdam voluptatem quia omnis', 'illo expedita consequatur quia in', 'quo adipisci enim quam ut ab', 'molestiae perspiciatis ipsa', 'illo est ratione doloremque quia maiores aut']\n"
]
}
],
"source": [
"#https://realpython.com/api-integration-in-python/\n",
"\n",
"def get_string_from_api(page_id: int, timeout: float = 10.0) -> str:\n",
"    \"\"\"Return the ``title`` of one JSONPlaceholder todo item.\n",
"\n",
"    Args:\n",
"        page_id: Zero-based index; the todo requested is ``page_id + 1``.\n",
"        timeout: Seconds to wait for the server before giving up.\n",
"\n",
"    Raises:\n",
"        requests.RequestException: If the request fails, times out, or the\n",
"            server answers with an HTTP error status.\n",
"    \"\"\"\n",
"    # The URL must be built from the argument; relying on a global loop\n",
"    # variable would make the result ignore ``page_id``.\n",
"    api_url = f\"https://jsonplaceholder.typicode.com/todos/{page_id + 1}\"\n",
"    response = requests.get(api_url, timeout=timeout)\n",
"    response.raise_for_status()\n",
"    return response.json()['title']\n",
"\n",
"api_words = [get_string_from_api(page_id) for page_id in range(10)]\n",
"print(api_words)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "aa9da320-bb18-4804-aaa3-aa320998cadd",
"showTitle": true,
"title": "3. Faker third-party module"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Benjamin Henry', 'Jessica Ward', 'Jennifer Garrett', 'Kevin Burch', 'Ian Daniel', 'Sean Marshall', 'Michael Johnson', 'Gregory Kelley', 'Deborah Powell', 'James Brown', 'Sharon Obrien', 'Sara Young', 'Jose Weiss', 'Jacob Spears', 'Robert Kelly', 'Jennifer Perez', 'Deborah Richardson', 'David Choi', 'Gary Mcguire', 'Jeremy Gallagher', 'Kristen Carter', 'Rhonda Odom', 'Toni Harvey', 'Douglas Hernandez', 'Mary Montgomery', 'James Kidd', 'William Riggs', 'Tina Fitzgerald', 'Ryan Evans', 'Jasmine Garcia', 'Taylor Orr', 'Robert Young', 'Anna Miller', 'John Lopez', 'Ian Smith', 'Gregory Lawson', 'Jonathan Kirk', 'Melvin Rodriguez', 'Sarah Thomas', 'David Bonilla', 'Jose Moore', 'Heather Poole', 'Kendra Winters', 'Elizabeth Hopkins', 'Laura Robles', 'Brian James', 'Teresa Potter', 'Carol Becker', 'Dr. Stacey Matthews', 'Michael Torres', 'Michelle Watkins', 'Linda Gonzalez', 'Nancy Hanson', 'Sherri Martin', 'Hannah Stephens', 'Timothy Franklin', 'Anthony Gibbs', 'Jonathan Moran', 'Karen Garcia', 'Daniel Vargas', 'Amanda Small', 'Steven Butler', 'Natalie Rosales', 'Tabitha Juarez', 'Jaclyn Chen', 'Michael Whitehead', 'Caitlyn Bryant', 'Douglas Campbell', 'Stacey Butler', 'James Collins', 'Kathryn Harris', 'Chase Jones', 'Elizabeth Pratt', 'Sylvia Martinez', 'Dennis Young', 'Kimberly Clark', 'Mary Weiss', 'Pamela Lopez', 'Eric Mckee', 'Ryan Watson', 'Jessica Eaton', 'Colin Carter', 'Shawn Horton', 'Clinton Hendricks', 'Wanda Crawford', 'Ronald Clayton', 'Michelle Hernandez', 'Amanda Moore', 'Evelyn Davis', 'Daniel Turner', 'Eric Mckenzie', 'Amanda Chen', 'Morgan Charles', 'Morgan Harris', 'William Wolfe', 'Stephen Davidson', 'Melissa Scott', 'Brian Martin', 'Adam Harris', 'Andrew Owens']\n"
]
}
],
"source": [
"from faker import Faker\n",
"\n",
"# Single shared Faker instance used by create_faker_name.\n",
"fake = Faker()\n",
"\n",
"def create_faker_name():\n",
"    \"\"\"Return one name produced by the shared ``fake`` generator.\"\"\"\n",
"    generated_name = fake.name()\n",
"    return generated_name\n",
"\n",
"# Generate one fake name per sample slot and show them all.\n",
"sample_size = 100\n",
"list_names = [create_faker_name() for _ in range(sample_size)]\n",
"print(list_names)\n",
"\n",
"def faker_udf_func(list_names: list[str], person_id: int) -> str:\n",
"    \"\"\"Pick the name stored at position ``person_id`` of ``list_names``.\n",
"\n",
"    Args:\n",
"        list_names: Candidate names.\n",
"        person_id: Index into ``list_names``.\n",
"\n",
"    Raises:\n",
"        IndexError: If ``person_id`` is outside the bounds of ``list_names``.\n",
"    \"\"\"\n",
"    return list_names[person_id]\n",
"\n",
"# Register the name lookup as a string-returning Spark UDF.\n",
"faker_udf = F.udf(faker_udf_func, returnType=StringType())"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "13801e47-d3ba-4924-a970-5e171ec0c895",
"showTitle": true,
"title": "4. ChatGPT generated function"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error: Unable to fetch activity. Status code: 503\n",
"None\n"
]
}
],
"source": [
"def get_bored_activity():\n",
"    \"\"\"Ask the Bored API for an activity suggestion.\n",
"\n",
"    Returns the ``activity`` field of the JSON reply (\"No activity found\" when the\n",
"    field is absent). On a non-200 status or any exception, a message is printed\n",
"    and ``None`` is returned.\n",
"    \"\"\"\n",
"    # Bored API endpoint\n",
"    api_url = \"https://www.boredapi.com/api/activity\"\n",
"\n",
"    try:\n",
"        response = requests.get(api_url)\n",
"\n",
"        # Anything other than 200 is reported and treated as \"no result\".\n",
"        if response.status_code != 200:\n",
"            print(f\"Error: Unable to fetch activity. Status code: {response.status_code}\")\n",
"            return None\n",
"\n",
"        # Parse the JSON body and pull out the activity text.\n",
"        return response.json().get(\"activity\", \"No activity found\")\n",
"    except Exception as e:\n",
"        print(f\"An error occurred: {e}\")\n",
"\n",
"    return None\n",
"\n",
"# Example usage\n",
"bored_activity = get_bored_activity()\n",
"print(bored_activity)\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "7e5dcf1a-ebb3-439d-9b48-d620a0a2abe3",
"showTitle": true,
"title": "Build DataFrame"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"24/12/10 16:29:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
},
{
"data": {
"text/plain": [
"DataFrame[id: bigint, Country: string, Full Name: string, Username: string]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Example Dimension\n",
"from pyspark.sql import SparkSession\n",
"\n",
"spark = SparkSession.builder.getOrCreate()\n",
"min_number = 0\n",
"partitions_requested = 4\n",
"\n",
"data_rows = 100\n",
"\n",
"# Generator spec: an id column plus a Country column with random values from countries_list.\n",
"dataspec = (\n",
"    dg.DataGenerator(spark, name=\"dataset\", rows=data_rows, partitions=partitions_requested)\n",
"    .withIdOutput()\n",
"    .withColumn(\"Country\", \"string\", values=countries_list, random=True)\n",
")\n",
"\n",
"df = dataspec.build().cache()\n",
"\n",
"# Full Name indexes list_names by the row id; Username is derived from Full Name\n",
"# with 5 random letters appended.\n",
"df = (\n",
"    df\n",
"    .withColumn(\"Full Name\", faker_udf(F.lit(list_names), F.col(\"id\")))\n",
"    .withColumn(\"Username\", usernames_udf(F.lit(5), F.col(\"Full Name\")))\n",
")\n",
"\n",
"display(df)"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 4
},
"notebookName": "Random Data Generator",
"widgets": {}
},
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment