Created
January 12, 2023 20:34
-
-
Save Yonet/797ca3059294e5c82ebe15682135e965 to your computer and use it in GitHub Desktop.
Cleaning the Airbnb data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { readFile } = require('node:fs/promises'); | |
const { writeFile } = require('node:fs/promises'); | |
const { resolve } = require('node:path'); | |
// Raw listing columns stripped from every record once the reshaped fields
// (fees, address, reviews, ...) have been derived from them.
// Both spellings "accomodates" and "accommodates" are kept on purpose so that
// whichever one the input uses gets removed.
const deletePropertiesList = [
  // scrape metadata and descriptive text
  "scrape_id", "source", "last_scraped", "name", "summary", "space",
  "description", "experiences_offered", "price", "accomodates", "picture_url",
  // host
  "host_id", "host_name", "host_since", "host_location", "host_about",
  "host_response_time", "host_response_rate", "host_acceptance_rate",
  "host_is_superhost", "host_thumbnail_url", "host_picture_url",
  "host_neighbourhood", "host_listings_count", "host_total_listings_count",
  "host_verifications", "host_has_profile_pic", "host_identity_verified",
  // location
  "street", "neighbourhood", "neighbourhood_cleansed",
  "neighbourhood_group_cleansed", "city", "state", "zipcode", "market",
  "smart_location", "country_code", "country", "is_location_exact",
  // property details and pricing
  "room_type", "accommodates", "bathrooms", "bedrooms", "beds", "bed_type",
  "amenities", "square_feet", "weekly_price", "cleaning_fee",
  "guests_included", "extra_people", "minimum_nights", "maximum_nights",
  // calendar / availability
  "calendar_updated", "has_availability", "availability_30",
  "availability_60", "availability_90", "availability_365",
  "calendar_last_scraped",
  // reviews
  "number_of_reviews", "first_review", "review_scores_accuracy",
  "review_scores_cleanliness", "review_scores_checkin",
  "review_scores_communication", "review_scores_location",
  "review_scores_value",
  // licensing and booking policy
  "requires_license", "license", "jurisdiction_names", "instant_bookable",
  "cancellation_policy", "require_guest_profile_picture",
  "require_guest_phone_verification", "calculated_host_listings_count",
  "reviews_per_month",
  // derived night / review counters
  "minimum_minimum_nights", "maximum_minimum_nights",
  "minimum_maximum_nights", "maximum_maximum_nights",
  "minimum_nights_avg_ntm", "maximum_nights_avg_ntm",
  "number_of_reviews_ltm", "number_of_reviews_l30d",
  "host_url",
  "calculated_host_listings_count_entire_homes",
  "calculated_host_listings_count_private_rooms",
  "calculated_host_listings_count_shared_rooms",
];
/**
 * Parse a date in yyyy-mm-dd format into a Date in local time.
 *
 * @param {string} input - Date string such as "2023-01-12".
 * @returns {Date|null} The parsed Date, or null when `input` is not a
 *   non-blank string (e.g. a missing/null field). A string that is not in
 *   yyyy-mm-dd form still yields an Invalid Date rather than throwing.
 */
function parseDate(input) {
  // A missing field used to throw on `.split`; report it as "no date" instead.
  if (typeof input !== 'string' || input.trim() === '') {
    return null;
  }
  const [year, month, day] = input.split('-').map(Number);
  // new Date(year, month [, day [, hours[, minutes[, seconds[, ms]]]]])
  return new Date(year, month - 1, day); // Note: months are 0-based
}
/**
 * Build a URL-friendly slug from a title.
 *
 * The title is lower-cased, runs of whitespace become a single hyphen, every
 * character other than [A-Za-z0-9_-] is dropped, and repeated or
 * leading/trailing hyphens are collapsed away.
 *
 * @param {string} title - Human-readable title.
 * @returns {string} The slug; an empty string when `title` is null/undefined.
 */
function createSlug(title) {
  if (title == null) {
    return '';
  }
  return String(title)
    .toLowerCase()
    .trim()
    .replace(/\s+/g, '-')
    .replace(/[^\w-]+/g, '')
    // Removing punctuation can leave "--" behind (e.g. "a & b"); tidy it up.
    .replace(/-{2,}/g, '-')
    .replace(/^-+|-+$/g, '');
}
/**
 * Remove the named properties from an object, in place.
 *
 * Property names that are not present on `data` are ignored.
 *
 * @param {object} data - Object to strip; it is mutated.
 * @param {Iterable<string>} deletePropertiesList - Names of the properties to remove.
 * @returns {void}
 */
function deleteProperties(data, deletePropertiesList) {
  for (const property of deletePropertiesList) {
    delete data[property];
  }
}
/**
 * Clean the Airbnb listings export.
 *
 * Reads ./listings.json, derives the reshaped fields (fees, slug, capacity,
 * address, photos, reviews, ...) on every listing, strips the raw columns
 * named in `deletePropertiesList`, and writes the result to ./listings01.json.
 *
 * Any failure is caught and its message is printed to stderr, so the returned
 * promise does not reject.
 *
 * @returns {Promise<void>}
 */
async function logFile() {
  try {
    const filePath = resolve('./listings.json');
    const newFilePath = resolve('./listings01.json');
    const contents = JSON.parse(await readFile(filePath, { encoding: 'utf8' }));
    console.log(typeof (contents));

    if (!Array.isArray(contents)) {
      throw new TypeError('listings.json must contain a JSON array of listings');
    }

    // A listing may lack one of its date fields (presumably e.g. a listing
    // with no reviews yet - confirm against the data); one such record must
    // not abort the whole run.
    const toDate = (value) => (value ? parseDate(value) : null);

    contents.forEach((data) => {
      const slug = createSlug(String(data.name ?? ''));

      data.fees = { "rent": data.price };
      data.slug = slug;
      // The original read the misspelled `accomodates`, which left capacity
      // undefined whenever the column is spelled `accommodates`.
      data.capacity = data.accommodates ?? data.accomodates;
      data.createdAt = toDate(data.host_since);
      data.address = {
        "id": data.id,
        "latitude": data.latitude,
        "longitude": data.longitude,
        "slug": slug,
        // `street` and `zipcode` are the names listed for deletion, so they
        // are presumably the raw column names (confirm against listings.json);
        // the previously-read `address` / `zip` are kept as fallbacks.
        "street": data.street || data.address || "1234 Main St",
        "city": data.city || "San Francisco",
        "state": data.state || "CA",
        "zipCode": data.zipcode || data.zip || "94103",
        "country": "US",
        "neighbourhood": data.neighbourhood,
        // NOTE(review): placeholder value; yields a non-integer in [0, 10).
        "buildingNumber": Math.random() * 10,
        "neighbourhood_cleansed": data.neighbourhood_cleansed,
        "createdAt": toDate(data.first_review),
      };
      data.title = data.name;
      data.photos = [data.picture_url, data.picture_url, data.picture_url, data.picture_url];
      data.isFeatured = true;
      data.isRecommended = false;
      data.reviews = {
        "id": data.id,
        "slug": slug,
        "userId": data.host_id,
        "rating": data.review_scores_rating,
        "listingId": data.id,
        "comment": ["Great place to stay", "I would stay here again", "I would not stay here again"],
        createdAt: toDate(data.last_review),
      };

      // Must run last: the derived fields above read the raw columns.
      deleteProperties(data, deletePropertiesList);
    });

    await writeFile(newFilePath, JSON.stringify(contents));
    // Report success only after the write has actually completed.
    console.log('done');
  } catch (err) {
    console.error(err.message);
  }
}

logFile();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment