Skip to content

Commit

Permalink
Merge pull request #45 from DanielFillol/daniel
Browse files Browse the repository at this point in the history
feat: implemented GetElementAttributeFromNode
  • Loading branch information
DanielFillol authored Feb 28, 2025
2 parents 8987350 + 9489630 commit 18937e3
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 16 deletions.
33 changes: 33 additions & 0 deletions goSpider.go
Original file line number Diff line number Diff line change
Expand Up @@ -1481,3 +1481,36 @@ func FindNodes(node *html.Node, nodeExpression string) ([]*html.Node, error) {
}
return nil, errors.New("could not find specified node")
}

// GetElementAttributeFromNode retrieves the value of a specified attribute from the
// first element matched by an XPath expression within a given HTML node.
//
// Parameters:
//   - node: The root HTML node to search within.
//   - xpathExpr: The XPath expression that identifies the target element.
//   - attribute: The attribute name whose value you want to retrieve.
//
// Returns:
//   - The attribute value as a string.
//   - An error if node is nil, the XPath expression is invalid, or the element
//     or attribute cannot be found.
func GetElementAttributeFromNode(node *html.Node, xpathExpr, attribute string) (string, error) {
	if node == nil {
		return "", errors.New("cannot search for an element in a nil node")
	}

	// htmlquery.Query reports an invalid expression through its error return,
	// whereas htmlquery.FindOne panics on it; a function that already returns
	// an error should not crash its caller on bad input.
	target, err := htmlquery.Query(node, xpathExpr)
	if err != nil {
		return "", fmt.Errorf("invalid XPath %q: %w", xpathExpr, err)
	}
	if target == nil {
		return "", fmt.Errorf("failed to find element for XPath: %s", xpathExpr)
	}

	// Walk the attribute list directly so that an attribute that is present
	// with an empty value is still distinguished from a missing attribute.
	for _, attr := range target.Attr {
		if attr.Key == attribute {
			return attr.Val, nil
		}
	}

	return "", fmt.Errorf("attribute %s not found in element", attribute)
}
50 changes: 34 additions & 16 deletions goSpider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,6 @@ func TestPrintHtml(t *testing.T) {

}

// TestParseStringToHtmlNode tests the ParseStringToHtmlNode function.
func TestParseStringToHtmlNode(t *testing.T) {
// Sample HTML string to parse
htmlString := "<html><head></head><body><h1>Hello, World!</h1></body></html>"
Expand Down Expand Up @@ -696,21 +695,40 @@ func TestDatepicker(t *testing.T) {
}
}

// This test cannot pass in automated runs: 2FA requires the user to type a code into the terminal, so it is kept disabled.
//// TestLoginGoogle tests google single logon
//func TestLoginGoogle(t *testing.T) {
// profilePath := "/Users/USER_NAME/Library/Application Support/Google/Chrome/Profile 2\""
// nav := NewNavigator(profilePath)
// defer nav.Close()
//
// err := nav.LoginWithGoogle("", "")
// if err != nil {
// t.Errorf("LoginWithGoogle error: %v", err)
// }
//
//}

//Full Crawlers
// TestGetElementAttributeFromNode opens a live search-results page, locates the
// result list items with FindNodes, and extracts the "href" attribute of each
// item's title link via GetElementAttributeFromNode.
//
// The test depends on network access and on the current markup of the target
// site, so the XPath expressions below may need updating if the page changes.
func TestGetElementAttributeFromNode(t *testing.T) {
	nav := NewNavigator("", true)
	// Release the browser even when the test aborts early.
	defer nav.Close()
	nav.DebugLogger = false

	err := nav.OpenURL("https://www.jusbrasil.com.br/jurisprudencia/busca?q=tjsp&dateFrom=2000-01-01&dateTo=2000-01-31")
	if err != nil {
		t.Fatalf("OpenURL error: %v", err)
	}

	htmlContent, err := nav.GetPageSource()
	if err != nil {
		t.Fatalf("GetPageSource error: %v", err)
	}
	// Nothing below can work without a page, so stop instead of continuing
	// with a nil node.
	if htmlContent == nil {
		t.Fatal("GetPageSource returned empty content")
	}

	nodes, err := FindNodes(htmlContent, "//*[@id=\"__next\"]/main/div[3]/div/div/div/section/ul/li")
	if err != nil {
		t.Fatalf("FindNodes error: %v", err)
	}

	elements := make([]string, 0, len(nodes))
	for _, node := range nodes {
		element, err := GetElementAttributeFromNode(node, "div/div/div/article/div/div/div[1]/h2/a", "href")
		if err != nil {
			t.Errorf("GetElementAttributeFromNode error: %v on node: %v", err, node)
			// Do not record the meaningless empty value of a failed lookup.
			continue
		}
		elements = append(elements, element)
	}

	log.Println(elements)
}

func TestParallelRequests(t *testing.T) {
users := []Request{
Expand Down

0 comments on commit 18937e3

Please sign in to comment.