Last active
April 29, 2024 02:55
-
-
Save swsalim/664937980b4333300def43c99207cc3e to your computer and use it in GitHub Desktop.
Clean APIFY Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as fs from 'fs'; | |
import * as path from 'path'; | |
/** Object shape with one string entry per weekday, keyed by the lowercase English day name. */
type Days = Record<
  'monday' | 'tuesday' | 'wednesday' | 'thursday' | 'friday' | 'saturday' | 'sunday',
  string
>;

/** Two-letter abbreviation for each weekday, listed Monday through Sunday. */
const days: Days = {
  monday: 'Mo',
  tuesday: 'Tu',
  wednesday: 'We',
  thursday: 'Th',
  friday: 'Fr',
  saturday: 'Sa',
  sunday: 'Su',
};
/**
 * Converts arbitrary text into a lowercase, hyphen-separated slug.
 *
 * Processing order:
 *  1. lowercase and trim the input;
 *  2. transliterate each character through the `from` -> `to` table
 *     (accented letters become their plain letter, separator punctuation
 *     becomes `-`);
 *  3. turn whitespace runs into `-` and `&` into `-and-`;
 *  4. drop every character that is neither a word character nor `-`;
 *  5. collapse repeated hyphens and strip hyphens from both ends.
 *
 * @param str - Text to convert.
 * @returns The slug, or an empty string when no usable character remains.
 */
const slugify = (str: string): string => {
  // Character at position k of `from` is replaced by the character at position k of `to`.
  const from = 'àáãäâèéëêìíïîòóöôùúüûñç·/_,:;';
  const to = 'aaaaaeeeeiiiioooouuuunc------';

  // Lowercase BEFORE the lookup so uppercase accents (e.g. 'É') hit the table too.
  // Each character is looked up by its own position in `from`, not by its position
  // in the input, and the pieces are joined with '' so no separator is injected.
  const transliterated = Array.from(str.toLowerCase().trim(), (letter) => {
    const tableIndex = from.indexOf(letter);
    return tableIndex === -1 ? letter : to.charAt(tableIndex);
  }).join('');

  return transliterated
    .replace(/\s+/g, '-') // Replace whitespace runs with -
    .replace(/&/g, '-and-') // Replace & with 'and'
    .replace(/[^\w-]+/g, '') // Remove everything that is not a word char or -
    .replace(/-{2,}/g, '-') // Replace multiple - with single -
    .replace(/^-+|-+$/g, ''); // No leading or trailing -
};
/** Properties removed from every place record before it is returned by `processData`. */
const KEYS_TO_DELETE: readonly string[] = [
  'popularTimesHistogram',
  'popularTimesLiveText',
  'popularTimesLivePercent',
  'peopleAlsoSearch',
  'additionalInfo',
  'location',
  'address',
  'subTitle',
  'description',
  'menu',
  'categoryName',
  'neighborhood',
  'locatedIn',
  'plusCode',
  'placeId',
  'categories',
  'cid',
  'imageCategories',
  'searchPageUrl',
  'searchPageLoadedUrl',
  'searchString',
  'scrapedAt',
  'imagesCount',
  'webResults',
  'orderBy',
  'reviewsTags',
  'questionsAndAnswers',
  'updatesFromCustomers',
  'reserveTableUrl',
  'googleFoodUrl',
  'hotelStars',
  'rank',
  'claimThisBusiness',
  'hotelDescription',
  'checkInDate',
  'checkOutDate',
  'similarHotelsNearby',
  'hotelReviewSummary',
  'hotelAds',
  'placesTags',
  'gasPrices',
];

/**
 * Cleans a list of scraped place records.
 *
 * For every record that has at least one `openingHours` entry, a cleaned copy is
 * produced that:
 *  - has `website` blanked when it contains 'healthhub';
 *  - gains `latLng` ("<lat>, <lng>") built from `location`, or '' when the
 *    coordinates are not available;
 *  - loses every property listed in `KEYS_TO_DELETE`;
 *  - gains `slug`, derived from `title` via `slugify`.
 *
 * Records that are not objects, or whose `openingHours` is missing or empty, are
 * skipped. The input records themselves are not modified.
 *
 * @param jsonData - Parsed JSON; must be an array of place records.
 * @returns A new array with the cleaned copies, in input order.
 * @throws TypeError when `jsonData` is not an array.
 */
function processData(jsonData: any): any {
  if (!Array.isArray(jsonData)) {
    throw new TypeError('processData expects an array of place records.');
  }

  const result: any[] = [];

  for (const original of jsonData) {
    // Tolerate null / primitive entries instead of crashing on property access.
    if (original == null || typeof original !== 'object') {
      continue;
    }

    // A missing `openingHours` is treated the same as an empty one.
    if (!Array.isArray(original.openingHours) || original.openingHours.length === 0) {
      continue;
    }

    // Shallow copy so the caller's records are left untouched.
    const place = { ...original };

    if (typeof place.website === 'string' && place.website.includes('healthhub')) {
      place.website = '';
    }

    // Must be read before `location` is deleted below.
    const lat = place.location?.lat;
    const lng = place.location?.lng;
    place.latLng = lat != null && lng != null ? `${lat}, ${lng}` : '';

    for (const key of KEYS_TO_DELETE) {
      delete place[key];
    }

    // Coerce so a missing or non-string title yields an empty slug rather than a throw.
    place.slug = slugify(String(place.title ?? ''));

    result.push(place);
  }

  return result;
}
/**
 * Reads a JSON file, passes its parsed content through `processData`, and writes
 * the pretty-printed result to `./tasks/data/parsedData/processed-<input base name>`.
 * The output directory is created when it does not exist, and the output path is
 * logged once the file has been written.
 *
 * @param inputFileName - Path of the JSON file to process.
 */
function processFile(inputFileName: string) {
  // Load and transform the input.
  const sourcePath = path.resolve(inputFileName);
  const rawText = fs.readFileSync(sourcePath, 'utf-8');
  const processedData = processData(JSON.parse(rawText));

  // Make sure the destination directory is there.
  const outputDir = path.resolve('./tasks/data/parsedData');
  const outputDirMissing = !fs.existsSync(outputDir);
  if (outputDirMissing) {
    fs.mkdirSync(outputDir, { recursive: true });
  }

  // The output keeps the input's base name, prefixed with 'processed-'.
  const targetPath = path.join(outputDir, `processed-${path.basename(inputFileName)}`);
  const serialized = JSON.stringify(processedData, null, 2);
  fs.writeFileSync(targetPath, serialized, 'utf-8');

  console.log(`Processed file saved as: ${targetPath}`);
}
// Script entry point: the first CLI argument names a file inside ./tasks/data.
const [, , command] = process.argv;

// Without a file name there is nothing to process; report it and exit with status 1.
if (!command) {
  console.error('Please provide a file name.');
  process.exit(1);
}

const filename = `./tasks/data/${command}`;
processFile(filename);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment