Address
304 North Cardinal St.
Dorchester Center, MA 02124
Work Hours
Monday to Friday: 7AM - 7PM
Weekend: 10AM - 5PM
Well, once I hit over 4300 bookmarks, I decided it was time to write a script that would get me some bookmark stats (such as folders, number per folder, duplicates, etc.).
When I initially set out, I wanted to figure out what information I could get from a browser HTML export that would be useful (as well as how to display it).
import collections
import cProfile
import re
import requests
from bs4 import BeautifulSoup
# http://www.quesucede.com/page/show/id/python-3-tree-implementation
class Node:
    """One entry in the folder tree: a label plus links to its relatives.

    ``parent`` holds the parent Node object (or None for the root), while
    ``children`` holds the *identifiers* of the child nodes, not Node objects.
    """

    def __init__(self, identifier, parent=None):
        self._identifier = identifier
        self._parent = parent
        self._children = []

    @property
    def identifier(self):
        """The label this node was created with."""
        return self._identifier

    @property
    def children(self):
        """Child identifiers in insertion order."""
        return self._children

    @property
    def parent(self):
        """The parent Node object, or None if this is the root."""
        return self._parent

    def add_child(self, identifier):
        """Record ``identifier`` as a child of this node."""
        self._children.append(identifier)
class Tree:
    """A dictionary-backed tree of Node objects keyed by identifier.

    Based on http://www.quesucede.com/page/show/id/python-3-tree-implementation
    """

    def __init__(self):
        self.__nodes = {}

    @property
    def nodes(self):
        """Mapping of identifier -> Node for every node in the tree."""
        return self.__nodes

    def add_node(self, identifier, parent=None):
        """Create and register a Node, optionally linked under ``parent``.

        Raises KeyError if ``parent`` is given but not already in the tree.
        Returns the new Node.
        """
        if parent is not None:
            node = Node(identifier, self.__nodes[parent])
        else:
            node = Node(identifier)
        self[identifier] = node
        if parent is not None:
            self[parent].add_child(identifier)
        return node

    def display(self, identifier, depth=0):
        """Pretty-print the subtree rooted at ``identifier``, one node per line."""
        children = self[identifier].children
        # BUG FIX: the original bare `print x` statements were Python 2 only;
        # a parenthesized single-argument print works under both 2 and 3.
        if depth == 0:
            print(identifier)
        else:
            print("\t" * depth + str(identifier))
        depth += 1
        for child in children:
            self.display(child, depth)  # recursive call

    def traverse(self, identifier, mode="depth"):
        """Yield Node objects starting at ``identifier``.

        ``mode="depth"`` walks depth-first, ``mode="breadth"`` breadth-first.
        Python generator. Loosely based on an algorithm from
        'Essential LISP' by John R. Anderson, Albert T. Corbett,
        and Brian J. Reiser, page 239-241.
        """
        yield self.__nodes[identifier]
        queue = self[identifier].children
        while queue:
            yield self.__nodes[queue[0]]
            expansion = self[queue[0]].children
            if mode == "depth":
                queue = expansion + queue[1:]  # dfs
            elif mode == "breadth":
                queue = queue[1:] + expansion  # bfs

    def __getitem__(self, key):
        return self.__nodes[key]

    def __setitem__(self, key, item):
        self.__nodes[key] = item
def createSoup(browser):
    """Parse the exported bookmarks HTML file for the given browser."""
    path = "./bookmarks_" + browser + ".html"
    with open(path, "r") as handle:
        markup = handle.read()
    return BeautifulSoup(markup, 'html.parser')
def getChildren(theNode, level):
    """Return the names of folders nested exactly one level below ``level``.

    Scans every <dl> under ``theNode``; a <dl> whose ancestor count equals
    ``level + 1`` is a direct child folder, named by its preceding <h3>.
    """
    found = []
    for sub in theNode.findAll('dl'):
        depth = len(sub.findParents('dl'))
        heading = sub.findPrevious('h3')
        if depth == level + 1:
            # The folder name is the full text content of its <h3> tag.
            found.append(str(''.join(heading.findAll(text=True))))
    return found
def genHeaderTree(browser, theSoup):
    """Build a Tree mirroring the bookmark folder hierarchy of the export.

    Chrome/Firefox exports name folders with <h3>; other exports are
    matched with a looser header regex.
    """
    folderLists = theSoup.findAll('dl')
    if browser == "chrome" or browser == "firefox":
        headerList = theSoup.findAll('h3')
        folderLists = folderLists[1:]  # skip the outermost wrapper <dl>
    else:
        headerList = theSoup.findAll(re.compile("h?"))
    tree = Tree()
    rootName = str(headerList[0].text)
    tree.add_node(rootName)
    for folder in folderLists:
        depth = len(folder.findParents('dl'))
        for childName in getChildren(folder, depth):
            # The nearest preceding header names the parent folder.
            if browser == "chrome" or browser == "firefox":
                parentName = str(folder.findPrevious('h3').text)
            else:
                parentName = str(folder.findPrevious(re.compile("h?")).text)
            tree.add_node(childName, parentName)
    return tree
def printHeaderList(browser, theTree, theSoup, linkList):
    """Print each folder, indented by depth, with its link count and share.

    Output format per folder: "<name> - <count> = <percent-of-total>%".
    """
    if browser == "chrome" or browser == "firefox":
        headers = theSoup.findAll('h3')
    else:
        headers = theSoup.findAll(re.compile("h?"))
    rootName = ''.join(headers[0].findAll(text=True))
    walker = theTree.traverse(rootName, "depth")
    removed = 0
    if browser in ("chrome", "firefox", "ie"):
        next(walker)  # Remove "Bookmarks Toolbar" or "Bookmarks"
        removed += 1
    for node in walker:
        # Depth = number of ancestors; drives the indentation width.
        depth = 0
        ancestor = node
        while ancestor.parent:
            depth += 1
            ancestor = ancestor.parent
        indent = "\t" * (depth - removed)
        links = getLinks(browser, theSoup, node.identifier)
        count = len(links)
        percentage = "{0:.2f}%".format(((count + 0.0) / len(linkList)) * 100)
        print (indent + str(node.identifier) + " - " +
               str(count) + " = " + percentage)
def getLinks(browser, theSoup, header):
    """Return all <a> tags inside the folder whose <h3> text equals ``header``.

    Raises ValueError when the header cannot be found or the browser is not
    one of the supported formats ("chrome", "firefox", "ie").  Previously a
    missing header crashed with AttributeError (``s`` stayed None) and an
    unknown browser made the while loop spin forever.
    """
    s = None
    # Total time: 2.00404 s - slowest part of the code
    for node in theSoup.findAll('h3'):
        if node.text == header:
            s = node
            break
    if s is None:
        raise ValueError("header not found: " + str(header))
    if browser not in ("chrome", "firefox", "ie"):
        raise ValueError("unsupported browser: " + str(browser))
    # Walk forward from the header to the <dl> holding the folder's links.
    while getattr(s, 'name', None) != 'dl':
        if browser == "chrome" or browser == "ie":
            s = s.nextSibling
        else:  # firefox
            s = s.findNext('dl')
    return s.findAll('a')
def populateList(linkList, urlType):
    """Extract href strings from anchor tags.

    ``urlType == "normal"`` keeps the URL as-is; ``"noProtocol"`` strips
    everything up to and including the first "://".  Any other value
    yields an empty list.
    """
    if urlType == "normal":
        return [str(link['href']) for link in linkList]
    if urlType == "noProtocol":
        return [str(link['href'].split('://', 1)[-1]) for link in linkList]
    return []
def getDupes(inList):
    """Return the distinct items that appear more than once in ``inList``."""
    dupes = []
    for value, seen in collections.Counter(inList).items():
        if seen > 1:
            dupes.append(value)
    return dupes
def checkStatus(link):
    """Issue a HEAD request for ``link`` and hand back the raw response."""
    return requests.head(link)
def main():
    """Load the bookmark export, then report folder stats, dupes, and errors."""
    supportedBrowsers = ["chrome", "firefox", "ie"]
    browser = "chrome"
    getCount = True
    checkDupes = True
    checkErrors = False
    mySoup = createSoup(browser)
    linkList = mySoup.find_all('a')
    # Firefox only fix: drop internal "place:" pseudo-bookmarks.
    # (Iterate a copy since we remove from the list while looping.)
    for link in linkList[:]:
        if link['href'].startswith('place'):
            linkList.remove(link)
    myTree = genHeaderTree(browser, mySoup)
    if getCount:
        total = len(linkList)
        # NOTE: parenthesized single-argument print is valid under both
        # Python 2 and 3 (the original bare prints were Python 2 only).
        print("Total number of bookmarks: " + str(total) + "\n")
    if checkDupes:
        # BUG FIX: the original used any(browser in s for s in ...), a
        # substring test that could match the wrong browser name; exact
        # list membership is what was intended.
        if browser in supportedBrowsers:
            printHeaderList(browser, myTree, mySoup, linkList)
        urlList = populateList(linkList, "normal")
        dupes = getDupes(urlList)
        print("\n\nDUPLICATE LINKS = " + str(len(dupes)))
        print("----------------")
        for dupe in dupes:
            print(dupe)
        urlList = populateList(linkList, "noProtocol")
        dupes = getDupes(urlList)
        print("\nDUPLICATE LINKS (IGNORING PROTOCOL) = " + str(len(dupes)))
        print("------------------------------------")
        for dupe in dupes:
            print(dupe)
    if checkErrors:
        print("\nERROR CONNECTS")
        print("---------------")
        for link in linkList:
            # BUG FIX: the original bare except left `response` unbound and
            # then dereferenced it, raising NameError; skip the link instead.
            try:
                response = checkStatus(link['href'])
            except Exception:
                print("ERROR?!")
                continue
            if response.status_code != 200:
                print(str(response.status_code) + " - " + link['href'])

if __name__ == "__main__":
    main()
The script had to show not only my total number of bookmarks, but also how many bookmarks I had in each folder (and sub-folders). This feature was not in any of the applications or plugins that I could find, but BookmarkStats does just that.
Additionally, I wanted duplicate checking in the same application, with some wiggle room. For now I have implemented ignoring the protocol (as I had some “dupes” that were normally ignored because they were HTTP vs HTTPS). In the future, I also plan on adding almost duplicates (maybe the URL slightly changed, or it is a similar URL to the same page).
I plan on adding support for more browsers, as well as some visualizations and even timeout checking in the future. That said, even for now this script has helped me drop down to 3508 bookmarks and counting!
As usual, the code and updates can always be found in my GitHub repository as well.
Ray Doyle is an avid pentester/security enthusiast/beer connoisseur who has worked in IT for almost 16 years now. From building machines and the software on them, to breaking into them and tearing it all down; he’s done it all. To show for it, he has obtained an OSCE, OSCP, eCPPT, GXPN, eWPT, eWPTX, SLAE, eMAPT, Security+, ICAgile CP, ITIL v3 Foundation, and even a sabermetrics certification!
He currently serves as a Senior Staff Adversarial Engineer for Avalara, and his previous position was a Principal Penetration Testing Consultant for Secureworks.
This page contains links to products that I may receive compensation from at no additional cost to you. View my Affiliate Disclosure page here. As an Amazon Associate, I earn from qualifying purchases.