All Design Employment Eric's Works Programming & Coding

Code Sample: JavaScript ES6 – Scrape OneTab Links (2018)

Here is a program I wrote in 2018. It uses vanilla JavaScript ES6 to scrape and parse links on a OneTab bookmarks page and outputs the results as HTML. While the resulting JavaScript document isn’t a web page, per se, it can be copied into an HTML document and saved.

(also on GitHub)

/*
Program Name:   Console: Scrape OneTab Links
File Name:      ehCode_2018.03.05_JavaScriptES6_ScrapeOneTabLinks_01.js
Date Created:   02/27/18
Date Modified:  04/01/18
Version:        1.02
Programmer:     Eric Hepperle

Purpose: Parses links and information from OneTab.
    Displays categories in console. Uses vanilla JavaScript ES6. 
    
    NOTE: Can't inject jQuery anymore due to "Content Security Policy",
     so this version uses vanilla JavaScript ES6.

Usage: Open OneTab page in a browser and copy-paste the code below
    into the console.

    To save/archive the links results use code inspector in browser to
    grab the "body" tag and contents, then paste that into a new document
    and save it.	
    
    Sample results: N/A	

Requires: 
    * Browser console

*/

/* global $ */
/*jshint esversion: 6 */

console.clear();

// ********************** GLOBAL VARIABLES
// NOTE: these stay `var` (not let/const) on purpose: the script is meant to
//  be pasted into the browser console, and several of them are reassigned
//  by the functions further down.

    // =========== Output Variables =========
    // Groups array to store all group info (this is the root)
    var objArrGroups = [];

    // Output string to generate new page
    var strOut = '';
    // =========== END Output Variables =====


    // =========== Row Variables ============
    // Row Link
    var rowLink = '';

    // Row Text
    var rowText = '';

    // Icon URL
    var rowIconLink = '';

    // Row domain
    var rowDomain = '';
    // ========== END Row Variables =========


    // =========== Counters =================
    // Blank Title Count (number used for the next untitled group)
    var blankTitleCount = 1;

    // Group Counter = 1
    var groupCount = 1;

    // Total number of groups counted
    var groupsTotal = 0;

    // Row Counter
    var rowCount = 1;
    //============ END Counters =============


    // =========== Selector Constants =======
    // Skips the first 3 children and the last child of #contentAreaDiv.
    // The :not(...) is now explicitly closed; the earlier string relied on
    // the CSS parser auto-closing the open parenthesis at end of input.
    var selAllGroups = "#contentAreaDiv > div:nth-child(n+4):not(:nth-last-child(-n+1))";

    var selGroupTitle = "div.tabGroupTitleText";
    // =========== END Selector Constants ===


    // =========== CSS Style Constants ======
    // Used as %c styles for console.log debugging output
    var aliceblue_dashed = "background:aliceblue; border-bottom: dashed 3px cadetblue";
    var lemonyellow_dashed = "background:#ffffb3; border-bottom: dashed 3px orange";
    var lemyel = "background:#ffffb3";
    var ltgrn = "background:lightgreen";
    // =========== END CSS Style Constants ==

// ********************** HELPER FUNCTIONS

/**
 * Left-pads a value to a minimum width.
 *
 * The value is converted to a string; if it is already at least `width`
 * characters long it is returned untouched. Otherwise one copy of the fill
 * is prepended for every missing character.
 *
 * Usage:
 *
 *   pad(10, 4);      // 0010
 *   pad(9, 4);       // 0009
 *   pad(123, 4);     // 0123
 *   pad(10, 4, '-'); // --10
 *   pad(10, 4, ' '); //   10
 *
 * @param {*} n - Value to pad (stringified with `+ ''`).
 * @param {number} width - Minimum length of the result.
 * @param {string} [z] - Fill to prepend; any falsy value falls back to '0'.
 * @returns {string} The padded string.
 */
function pad(n, width, z) {
  const filler = z || '0';
  const digits = n + '';

  // Already wide enough: nothing to prepend.
  if (digits.length >= width) {
    return digits;
  }

  // Joining (k + 1) empty slots yields k copies of the filler.
  return new Array(width - digits.length + 1).join(filler) + digits;
}

// ********************** MAIN
        
/**
 * Runs the OneTab "Created ..." pattern against one line of group info.
 *
 * @param {*} line - Candidate line; anything that is not a string never matches.
 * @returns {?Array} The regex match (indexes 1-7 hold month, day, 4-digit
 *     year, hour, minute, second, AM/PM) or null if the line is not a
 *     timestamp line.
 */
function matchCreatedLine(line) {
    // Parse time date with Regex like:
    //  Created 6/21/2016, 1:41:28 PM
    const reg = /^Created\s+(\d{1,2})\/(\d{1,2})\/(\d{4}),\s(\d{1,2}):(\d{1,2}):(\d{1,2})\s([APM]{2})$/;

    // trim() so trailing whitespace on an innerText line cannot defeat the
    // ^...$ anchors.
    return typeof line === 'string' ? reg.exec(line.trim()) : null;
}

/**
 * Get All Group And Row Info And Store In Array Of Objects.
 *
 * Reads every group element matched by `selAllGroups`, and for each one
 * builds an object with:
 *   groupTitle, year, date, time, monthNum, dayNum, hourNum, minuteNum,
 *   secondNum, ampm, rows (array of { rowText, rowLink, rowIconLink }).
 *
 * Side effects (globals): pushes each group onto `objArrGroups`, sets
 * `groupsTotal`, advances `blankTitleCount`, resets/increments `rowCount`,
 * and leaves the last processed row's values in `rowLink`, `rowText` and
 * `rowIconLink`. Debug information is written to the console.
 *
 * Malformed markup no longer aborts the scrape: a group whose timestamp
 * cannot be parsed gets 'unknown date' / 'unknown time' (date parts are
 * empty strings), and a row without a link is skipped with a warning.
 *
 * @returns {Array<Object>} `objArrGroups`, with an extra `blankTitleCount`
 *     property attached.
 */
function getAllGroups() {

    // Grab list of all link groups and row info
    const groups = [...document.querySelectorAll(selAllGroups)];

    // Store the groups array length
    groupsTotal = groups.length;

    // FOREACH GROUP
    groups.forEach(function(groupEl) {

        // Object that collects everything known about this group
        const group = {};

        // Which line of group info the groupName is on
        let groupNameStartLineIndex = 2;

        // If group title exists, use it. Else, build group title from the
        //  blank title counter. NOTE: testing for " " doesn't work but
        //  fromCharCode(160) (non-breaking space) does.
        const titleEl = groupEl.querySelector(selGroupTitle);
        const titleText = titleEl ? titleEl.innerText : '';
        let groupTitle;
        if (titleText && titleText !== String.fromCharCode(160)) {
            groupTitle = titleText;
        } else {
            groupTitle = "blankGroup_" + blankTitleCount;

            // increment blank title counter
            ++blankTitleCount;

            // Untitled groups have one line less of info before the date
            groupNameStartLineIndex = 1;
        }

        // Add group title to group object
        group.groupTitle = groupTitle;

        // Grab group details block (empty string if the markup is missing)
        const detailsEl = groupEl.querySelector("div > div > div");
        const thisGroupDetails = detailsEl && detailsEl.innerText ? detailsEl.innerText : '';
        console.log("%cGroup Info (incl. date):                         ", "background:orange");
        console.log(thisGroupDetails);

        // Split group details block into lines
        const arrGroupDetails = thisGroupDetails.split('\n');

        // Determine what line of group info the date is on: starring a
        //  group adds an "N tabs" line, which pushes the date down by one.
        let tempTimeDate = arrGroupDetails[groupNameStartLineIndex];
        if (typeof tempTimeDate === 'string' && tempTimeDate.includes("tabs")) {
            tempTimeDate = arrGroupDetails[groupNameStartLineIndex + 1];
        }

        let matches = matchCreatedLine(tempTimeDate);

        // The expected line was not a timestamp: fall back to the first
        //  line anywhere in the block that is one.
        if (!matches) {
            const fallbackLine = arrGroupDetails.find(function(line) {
                return matchCreatedLine(line) !== null;
            });
            if (fallbackLine !== undefined) {
                tempTimeDate = fallbackLine;
                matches = matchCreatedLine(fallbackLine);
            }
        }

        console.log("%ctempTimeDate: %s                                ", "background: lavender; border: solid gold 2px;", tempTimeDate);

        // debugging ...
        console.log("%c************ MATCHES ************** ", "background:yellow");
        console.log(matches);

        // Empty parts when nothing matched, so the group is still recorded
        const parts = matches ? matches.slice(1, 8) : ['', '', '', '', '', '', ''];
        const [monthNum, dayNum, year4, hourNum, minuteNum, secondNum, ampm] = parts;

        let date = 'unknown date';
        let time = 'unknown time';
        if (matches) {
            date = monthNum + "/" + dayNum + "/" + year4;
            time = hourNum + ":" + minuteNum + ":" + secondNum + " " + ampm;
        } else {
            console.warn("Could not parse a 'Created ...' timestamp for group: " + groupTitle);
        }

        // Grab just date and time
        console.log("%cDate:                         ", "background:bisque");
        console.log(date);
        console.log("%cTime:                         ", "background:bisque");
        console.log(time);

        // Add date and time info to group object. This will help with sorting
        group.year = year4;
        group.date = date;
        group.time = time;
        group.monthNum = monthNum;
        group.dayNum = dayNum;
        group.hourNum = hourNum;
        group.minuteNum = minuteNum;
        group.secondNum = secondNum;
        group.ampm = ampm;

        // Grab list of all rows in this group (second child holds the rows)
        const rowsContainer = groupEl.children[1];
        const rows = rowsContainer ? Array.from(rowsContainer.children) : [];
        // debugging ... child rows
        console.log("%cChild Rows:                        ", lemonyellow_dashed);
        console.log(rows);

        // Rows array, local to this group (previously an implicit global)
        const arrGroupRows = [];

        // Reset row counter to 1
        rowCount = 1;

        // Foreach Row:
        rows.forEach(function(rowEl) {

            // The link and favicon live in the row's second child
            const cell = rowEl.children[1];
            const anchor = cell ? cell.querySelector('a') : null;
            if (!anchor) {
                console.warn("Skipping row without a link in group: " + groupTitle);
                return;
            }
            const icon = cell.querySelector('img');

            // Grab row link, text and icon link
            rowLink = anchor.href;
            rowText = anchor.text;
            rowIconLink = icon ? icon.src : '';

            // Push this row object onto group rows array
            arrGroupRows.push({
                rowText: rowText,
                rowLink: rowLink,
                rowIconLink: rowIconLink
            });

            // Increment row counter
            ++rowCount;

        });
        // END processing rows in this group

        // Add group rows array onto this group as property
        group.rows = arrGroupRows;

        // Push this group object onto groups array
        objArrGroups.push(group);

    });
    // END processing groups

    objArrGroups.blankTitleCount = blankTitleCount;

    return objArrGroups;

} // END function

var groupInfo = getAllGroups();

// uncomment to output object
// console.log("%c --- GROUP INFO ---                           ", "background:#ffffb3;");
// console.log(groupInfo); 



// --------------------------------------------------------------------
// Create webpage by parsing the groups object and
//  launch in new window.

/**
 * Escapes a value so it can be placed inside HTML text or inside a quoted
 * attribute value. Group titles, link texts and URLs are scraped from the
 * page, so they must not be concatenated into markup as-is.
 *
 * @param {*} value - Value to escape (stringified first).
 * @returns {string} The value with & < > " ' replaced by entities.
 */
function escapeHtml(value) {
    return String(value)
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#39;");
}

/**
 * Extracts the host name from a URL string.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The host name, or '' when the URL cannot be parsed.
 */
function getDomain(url) {
    try {
        return new URL(url).hostname;
    } catch (err) {
        // An unparsable link is expected now and then; the domain is only
        // used for the favicon alt text, so an empty value is acceptable.
        return '';
    }
}

// add doctype and header to html output string
strOut += "<!DOCTYPE html>\n";
strOut += "<html lang='en'>\n";
strOut += "\t<head>\n";
strOut += "\t<title>Scraped Links Output Page</title>\n";
strOut += "\t<meta charset='utf-8'>\n";

// Styles for the generated page, injected into its <head>
var testTemplateLiteralStyle = `
<style>
.group-info {
    background-color: orange;
    border: solid black 2px;
    border-radius: 15px;
    padding: 10px;
    max-width: 1024px;
    display: inline-block;
}

.group-title {
    float: left;
    position: relative;
    top: -.6em;
}

.group-table {
    float: left;
    border: solid 3px gold;
    margin-left: 9em;
    background: #ffffb3;
    border-radius: .8em;
    padding: .6em;
    font-family: "courier new";
    font-size: .8em;
}

.clear:after {
  content: "";
  clear: both;
  display: table;
}

.row-icon {
    width: 16px;
    height: 16px;
}

/* Note: clear the div and the table */
</style>
`;

strOut += testTemplateLiteralStyle;
// Close the head that was opened above before starting the body
strOut += "\t</head>\n";
strOut += "<body>\n";

groupInfo.forEach(function(group, groupIndex) {

    // Build formatted group title:
    var htmGroupTitle = "<h2 class='group-title'>" + escapeHtml(group.groupTitle) + "</h2>\n";

    // Build formatted group table:
    var htmGroupTable = "<table class='clear group-table'>\n";
    htmGroupTable += "\t<tr>\n\t\t<td class='info-label'>Date &amp; Time:</td>\n\t\t<td>" + escapeHtml(group.date + ", " + group.time) + "</td>\n\t</tr>\n";
    htmGroupTable += "\t<tr>\n\t\t<td class='info-label'>Group #:</td>\n\t\t<td>" + (groupIndex+1) + "</td>\n\t</tr>\n";
    htmGroupTable += "</table>\n";

    // Build formatted group header and info:
    var htmGroupInfo = "<div class='clear group-info'>\n" + htmGroupTitle + htmGroupTable + "</div>\n";

    // debugging ...
    console.log(group.groupTitle);

    // Begin current link list:
    var htmRowsList = "<ul style='list-style: none'>";

    // Loop through all rows in this group ...
    group.rows.forEach(function(row, rowIndex) {

        // Format row index to 3 padded digits
        var formattedRowNum = pad((rowIndex+1), 3);
        // debugging ... formattedRowNum
        console.log("%cFormatted row number = %s", lemonyellow_dashed, formattedRowNum);

        // Domain of the linked page, used as the favicon's alt text
        rowDomain = getDomain(row.rowLink);

        // start row list item
        var htmRow = "<li class='row'>[Row #: " + formattedRowNum + "]: ";

        // add icon image to row (note the space between the attributes)
        htmRow += "\t<img alt='favicon for " + escapeHtml(rowDomain) + "' "
            + "class='row-icon' src='" + escapeHtml(row.rowIconLink) + "' />";

        // add hyperlink to row; rel keeps the opened page from reaching
        // back into this one through window.opener
        htmRow += "\t<a href='" + escapeHtml(row.rowLink) + "' target='_blank' rel='noopener noreferrer'>" +
            escapeHtml(row.rowText) + "</a></li>";

        // add row html to the rows string
        htmRowsList += htmRow;

    });

    // close current link list
    htmRowsList += "</ul><!-- END group -->\n";

    // assemble html parts for this group and add them to the out page html
    strOut += htmGroupInfo + htmRowsList + "<hr />";

});

// Add closing tags to html page string
strOut += "</body>\n</html>\n";

// Launch results in new window:
var win = window.open("", "APPLES");
if (win) {
    win.document.body.innerHTML = strOut;
} else {
    // window.open returns null when the browser blocks the popup; do not
    // lose the result in that case.
    console.warn("Could not open the results window (popup blocked?). The generated HTML follows:");
    console.log(strOut);
}


/*

NOTES:

    04/01/18 - Versioned to 1.02.
             - Restricts favicon display size to 16x16 px.
        
    03/05/18 - Versioned to 1.01.
             - Previous version didn't work correctly. #GOTCHA It used to be
                working code, but all of a sudden it wasn't working. Today,
                I realized the issue was because of how starring a OneTab
                group adds an element to the group info, so when I'm trying to parse a certain index, it fails as null. Instead of a date string, it is an element that says "7 tabs", "12 tabs", etc.
             - #SOLVED: If tempDate includes "tabs", then look for date in
                the next index (+1). WORKS now! :)

    02/27/18 - Created file from 11/27/17 version.
             - Versioned as #1.
             - Improved global variable organization.
             - Reorganized 'Notes' section to be descending by date
                #GOTCHA: Realized that my comments are so long now,
                it makes sense to have the latest ones on top so
                I don't have to scroll! :)
             - NOTE: This doesn't work for some reason. I KNOW that I fixed
                this within the last few months, but I can't find the
                working version. :(
             - 

    11/27/17 - Parses date and time from group info block
             - Versioned to 10.0
             - Refactored time/date parts into better semantic variables
             - Stores date and time info to groups.
             - Correctly grabs rows and stores to group! :) WORKS!!!
             - Generates new page correctly, except misses the last group.
             - #GOTCHA Figure out this off-by-one error.
             - Versioned to 11.0.
             - Removed debugs
             - Changed selAllGroups to (-n+1) from (-n+2). WORKS! :)
             - Changed group heading to include some other info
             - Adds template literal to inject styles into out page
             - Replaced inline styles with classes including clearfix
             - Formatted group info divs to be inline-block no float
             - Adds number padding function from:
             https://stackoverflow.com/questions/10073699/pad-a-number-with-leading-zeros-in-javascript
             
             - Formatted row numbers to pad with leading spaces.
    
    11/26/17 - Versioned to 9.0 - This version we will adapt to work with
                OneTab.
             - Refactored selectors as constants
             - Adds color & CSS style constants.
             - Adds groupsTotal to store total number of groups.
             - Adds debugs in getAllGroups()
             - SOLVED for How to exclude children from front and end with:

                var selAllGroups = "#contentAreaDiv > div:nth-child(n+4):not(:nth-last-child(-n+2)";
             			 
             - This link explains HOW TO TEST FOR &nbsp (non-breaking space)
                
                if (x == String.fromCharCode(160))
             
             - #GOTCHA Those last two were gotchas.
             - Adds blankTitleCount as property of objArrGroups.
             - Grabs and stores group info
             - We have an "off-by-one" error and are not grabbing the last group for some reason
             				
    11/25/17 - Versioned file to 5.0.
             - Removed redundant function.
             - Created algorithm on paper.
             - Versioned to 6.0 based on my algorithm.
             - Troubleshot code --> arrays and objects. This post helped:
             
             https://www.sitepoint.com/get-url-parameters-with-javascript/
             
             - 10:47 AM (CST) WORKS !!!! :) So far, this version builds a
             groupInfo object which stores all group data as JSON.
             
             - NOTE: This works on sample links list page, but has not been
                converted for OneTab yet.
                
             - Versioned to 7.0
             - Started from scratch with algorithm and left out most debugging
                and console logging. This is cleaner code.
             - WORKS!!! Generates new page! with grouped links! :) Next, test
                in OneTab
             - Versioned to 8.0
             - Added HTML header to results page --> sort of works. You will still
                need to copy the inner html of the whole document and move
                to a new file for archiving.
    
    11/24/17 - Duplicated file to make edits. Versioned to 2.0.
             - Added comments to document code and make it more understandable.
             - Replaced MS Word apostrophes with single-quotes
             - Changed arrTabGroups from const to a var and removed square
                brackets.
             - Changed selector for arrTabGroups to: 
                #contentAreaDiv div.row_text > a');
             - Adds elCount var as blank and undefines/unsets arrTabGroups
                at start of program.
             - Removes co-existing 2nd version of top arrTabGroups code. This
                was just for testing if we actually are getting all the links.
                YES!!! It works! :) Now we will make that a function.
             - Refactored code into getTabGroups() function. Still works! :)
             - Added the ... with brackets back after learning this is the
                ES6 way to auto convert a nodelist to an array. Still works! :)
             - Now we are going to see if we can parse more than just url.
             - Works! I'm getting img icon and link url. NEXT, let's get link text
                and lets make sure to build objects that we can iterate over.
                Here is a great link: https://www.sitepoint.com/dom-manipulation-vanilla-javascript-no-jquery/#modifyingthedom
                
    11/01/17 - Created file, beginning from copy of scrapeLinksFromOneTab3.
             - Converting from jQuery to vanilla javascript.	
    
-----------------------------


*/

/*

IDEAS & FUTURE IMPROVEMENTS:

- Consider de-duplicating icon urls
- Get domain from icon url
- Grab group date and time
- Sort by date and time
- Can I create a global row object?
- What is the expense of creating a new rowobject variable each iteration?
- Create JavaScript plugin or library from this.

*/

 

0 0 votes
Article Rating
Subscribe
Notify of
guest
0 Comments
Inline Feedbacks
View all comments